In [1]:
#Import all packages needed
import pandas as pd
import numpy as np
import requests
import tweepy 
import json 
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

Gathering Data¶

In [2]:
#Read CSV file 
twitter_archive = pd.read_csv('twitter-archive-enhanced.csv')
twitter_archive.head()
Out[2]:
tweet_id in_reply_to_status_id in_reply_to_user_id timestamp source text retweeted_status_id retweeted_status_user_id retweeted_status_timestamp expanded_urls rating_numerator rating_denominator name doggo floofer pupper puppo
0 892420643555336193 NaN NaN 2017-08-01 16:23:56 +0000 <a href="http://twitter.com/download/iphone" r... This is Phineas. He's a mystical boy. Only eve... NaN NaN NaN https://twitter.com/dog_rates/status/892420643... 13 10 Phineas None None None None
1 892177421306343426 NaN NaN 2017-08-01 00:17:27 +0000 <a href="http://twitter.com/download/iphone" r... This is Tilly. She's just checking pup on you.... NaN NaN NaN https://twitter.com/dog_rates/status/892177421... 13 10 Tilly None None None None
2 891815181378084864 NaN NaN 2017-07-31 00:18:03 +0000 <a href="http://twitter.com/download/iphone" r... This is Archie. He is a rare Norwegian Pouncin... NaN NaN NaN https://twitter.com/dog_rates/status/891815181... 12 10 Archie None None None None
3 891689557279858688 NaN NaN 2017-07-30 15:58:51 +0000 <a href="http://twitter.com/download/iphone" r... This is Darla. She commenced a snooze mid meal... NaN NaN NaN https://twitter.com/dog_rates/status/891689557... 13 10 Darla None None None None
4 891327558926688256 NaN NaN 2017-07-29 16:00:24 +0000 <a href="http://twitter.com/download/iphone" r... This is Franklin. He would like you to stop ca... NaN NaN NaN https://twitter.com/dog_rates/status/891327558... 12 10 Franklin None None None None
In [3]:
url="https://video.udacity-data.com/topher/2018/November/5bf60c69_image-predictions-3/image-predictions-3.tsv"
response = requests.get(url)
with open('image_prediction.tsv', 'wb') as file:
    file.write(response.content)
    
image_prediction = pd.read_csv('image_prediction.tsv', sep='\t')
image_prediction.head()
Out[3]:
tweet_id jpg_url img_num p1 p1_conf p1_dog p2 p2_conf p2_dog p3 p3_conf p3_dog
0 666020888022790149 https://pbs.twimg.com/media/CT4udn0WwAA0aMy.jpg 1 Welsh_springer_spaniel 0.465074 True collie 0.156665 True Shetland_sheepdog 0.061428 True
1 666029285002620928 https://pbs.twimg.com/media/CT42GRgUYAA5iDo.jpg 1 redbone 0.506826 True miniature_pinscher 0.074192 True Rhodesian_ridgeback 0.072010 True
2 666033412701032449 https://pbs.twimg.com/media/CT4521TWwAEvMyu.jpg 1 German_shepherd 0.596461 True malinois 0.138584 True bloodhound 0.116197 True
3 666044226329800704 https://pbs.twimg.com/media/CT5Dr8HUEAA-lEu.jpg 1 Rhodesian_ridgeback 0.408143 True redbone 0.360687 True miniature_pinscher 0.222752 True
4 666049248165822465 https://pbs.twimg.com/media/CT5IQmsXIAAKY4A.jpg 1 miniature_pinscher 0.560311 True Rottweiler 0.243682 True Doberman 0.154629 True
In [4]:
api_key = 'Zqkjh7AJq6hLnL5X6UHfzhBES'
api_key_secret = 'aEqG0URRpopHmE2msx40gBWkqJvGjeV3w6BveAi0gVVCRFHnDi'
bearer_token = 'AAAAAAAAAAAAAAAAAAAAAKWQgQEAAAAAx7ngtZv7r12IB8IfFgibsKHWedo%3DowzJieJ0c022GguMLQTFpJ8p7vSTtxIMiwz6pPJHTO5R9esrrz'
access_token = '206365523-aFjx2vsSDJdDcRsC12gVyWenQHdWOKjWECgA1BDy'
access_token_secret = 'zab4zuB0NP6rx94DWu2Fe108DonaJE3hparfqGA9uTTYq'
In [5]:
auth = tweepy.OAuthHandler(api_key, api_key_secret)
auth.set_access_token = (access_token, access_token_secret)
api = tweepy.API(auth, 
                 parser = tweepy.parsers.JSONParser())
In [6]:
full_status = []
not_found_tweets = []
for tweet_id in twitter_archive['tweet_id']:
    try:
        full_status.append(api.get_status(tweet_id))
    except Exception as e:
        not_found_tweets.append(tweet_id)
In [7]:
full_status[0:2]
Out[7]:
[{'created_at': 'Tue Aug 01 16:23:56 +0000 2017',
  'id': 892420643555336193,
  'id_str': '892420643555336193',
  'text': "This is Phineas. He's a mystical boy. Only ever appears in the hole of a donut. 13/10 https://t.co/MgUWQ76dJU",
  'truncated': False,
  'entities': {'hashtags': [],
   'symbols': [],
   'user_mentions': [],
   'urls': [],
   'media': [{'id': 892420639486877696,
     'id_str': '892420639486877696',
     'indices': [86, 109],
     'media_url': 'http://pbs.twimg.com/media/DGKD1-bXoAAIAUK.jpg',
     'media_url_https': 'https://pbs.twimg.com/media/DGKD1-bXoAAIAUK.jpg',
     'url': 'https://t.co/MgUWQ76dJU',
     'display_url': 'pic.twitter.com/MgUWQ76dJU',
     'expanded_url': 'https://twitter.com/dog_rates/status/892420643555336193/photo/1',
     'type': 'photo',
     'sizes': {'thumb': {'w': 150, 'h': 150, 'resize': 'crop'},
      'medium': {'w': 540, 'h': 528, 'resize': 'fit'},
      'small': {'w': 540, 'h': 528, 'resize': 'fit'},
      'large': {'w': 540, 'h': 528, 'resize': 'fit'}}}]},
  'extended_entities': {'media': [{'id': 892420639486877696,
     'id_str': '892420639486877696',
     'indices': [86, 109],
     'media_url': 'http://pbs.twimg.com/media/DGKD1-bXoAAIAUK.jpg',
     'media_url_https': 'https://pbs.twimg.com/media/DGKD1-bXoAAIAUK.jpg',
     'url': 'https://t.co/MgUWQ76dJU',
     'display_url': 'pic.twitter.com/MgUWQ76dJU',
     'expanded_url': 'https://twitter.com/dog_rates/status/892420643555336193/photo/1',
     'type': 'photo',
     'sizes': {'thumb': {'w': 150, 'h': 150, 'resize': 'crop'},
      'medium': {'w': 540, 'h': 528, 'resize': 'fit'},
      'small': {'w': 540, 'h': 528, 'resize': 'fit'},
      'large': {'w': 540, 'h': 528, 'resize': 'fit'}}}]},
  'source': '<a href="http://twitter.com/download/iphone" rel="nofollow">Twitter for iPhone</a>',
  'in_reply_to_status_id': None,
  'in_reply_to_status_id_str': None,
  'in_reply_to_user_id': None,
  'in_reply_to_user_id_str': None,
  'in_reply_to_screen_name': None,
  'user': {'id': 4196983835,
   'id_str': '4196983835',
   'name': 'WeRateDogs®',
   'screen_name': 'dog_rates',
   'location': 'all our links ➜',
   'description': 'Your Only Source For Professional Dog Ratings Instagram and Facebook ➜ WeRateDogs partnerships@weratedogs.com | nonprofit: @15outof10 ⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀',
   'url': 'https://t.co/YPc2Xq4Va2',
   'entities': {'url': {'urls': [{'url': 'https://t.co/YPc2Xq4Va2',
       'expanded_url': 'http://links.weratedogs.com',
       'display_url': 'links.weratedogs.com',
       'indices': [0, 23]}]},
    'description': {'urls': []}},
   'protected': False,
   'followers_count': 9357103,
   'friends_count': 21,
   'listed_count': 7614,
   'created_at': 'Sun Nov 15 21:41:29 +0000 2015',
   'favourites_count': 147566,
   'utc_offset': None,
   'time_zone': None,
   'geo_enabled': True,
   'verified': True,
   'statuses_count': 16172,
   'lang': None,
   'contributors_enabled': False,
   'is_translator': False,
   'is_translation_enabled': False,
   'profile_background_color': '000000',
   'profile_background_image_url': 'http://abs.twimg.com/images/themes/theme1/bg.png',
   'profile_background_image_url_https': 'https://abs.twimg.com/images/themes/theme1/bg.png',
   'profile_background_tile': False,
   'profile_image_url': 'http://pbs.twimg.com/profile_images/1552995729014247425/TaJbIdmK_normal.jpg',
   'profile_image_url_https': 'https://pbs.twimg.com/profile_images/1552995729014247425/TaJbIdmK_normal.jpg',
   'profile_banner_url': 'https://pbs.twimg.com/profile_banners/4196983835/1661991479',
   'profile_link_color': 'F5ABB5',
   'profile_sidebar_border_color': '000000',
   'profile_sidebar_fill_color': '000000',
   'profile_text_color': '000000',
   'profile_use_background_image': False,
   'has_extended_profile': False,
   'default_profile': False,
   'default_profile_image': False,
   'following': None,
   'follow_request_sent': None,
   'notifications': None,
   'translator_type': 'none',
   'withheld_in_countries': []},
  'geo': None,
  'coordinates': None,
  'place': None,
  'contributors': None,
  'is_quote_status': False,
  'retweet_count': 6973,
  'favorite_count': 33703,
  'favorited': False,
  'retweeted': False,
  'possibly_sensitive': False,
  'possibly_sensitive_appealable': False,
  'lang': 'en'},
 {'created_at': 'Tue Aug 01 00:17:27 +0000 2017',
  'id': 892177421306343426,
  'id_str': '892177421306343426',
  'text': "This is Tilly. She's just checking pup on you. Hopes you're doing ok. If not, she's available for pats, snugs, boop… https://t.co/aQFSeaCu9L",
  'truncated': True,
  'entities': {'hashtags': [],
   'symbols': [],
   'user_mentions': [],
   'urls': [{'url': 'https://t.co/aQFSeaCu9L',
     'expanded_url': 'https://twitter.com/i/web/status/892177421306343426',
     'display_url': 'twitter.com/i/web/status/8…',
     'indices': [117, 140]}]},
  'source': '<a href="http://twitter.com/download/iphone" rel="nofollow">Twitter for iPhone</a>',
  'in_reply_to_status_id': None,
  'in_reply_to_status_id_str': None,
  'in_reply_to_user_id': None,
  'in_reply_to_user_id_str': None,
  'in_reply_to_screen_name': None,
  'user': {'id': 4196983835,
   'id_str': '4196983835',
   'name': 'WeRateDogs®',
   'screen_name': 'dog_rates',
   'location': 'all our links ➜',
   'description': 'Your Only Source For Professional Dog Ratings Instagram and Facebook ➜ WeRateDogs partnerships@weratedogs.com | nonprofit: @15outof10 ⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀',
   'url': 'https://t.co/YPc2Xq4Va2',
   'entities': {'url': {'urls': [{'url': 'https://t.co/YPc2Xq4Va2',
       'expanded_url': 'http://links.weratedogs.com',
       'display_url': 'links.weratedogs.com',
       'indices': [0, 23]}]},
    'description': {'urls': []}},
   'protected': False,
   'followers_count': 9357103,
   'friends_count': 21,
   'listed_count': 7614,
   'created_at': 'Sun Nov 15 21:41:29 +0000 2015',
   'favourites_count': 147566,
   'utc_offset': None,
   'time_zone': None,
   'geo_enabled': True,
   'verified': True,
   'statuses_count': 16172,
   'lang': None,
   'contributors_enabled': False,
   'is_translator': False,
   'is_translation_enabled': False,
   'profile_background_color': '000000',
   'profile_background_image_url': 'http://abs.twimg.com/images/themes/theme1/bg.png',
   'profile_background_image_url_https': 'https://abs.twimg.com/images/themes/theme1/bg.png',
   'profile_background_tile': False,
   'profile_image_url': 'http://pbs.twimg.com/profile_images/1552995729014247425/TaJbIdmK_normal.jpg',
   'profile_image_url_https': 'https://pbs.twimg.com/profile_images/1552995729014247425/TaJbIdmK_normal.jpg',
   'profile_banner_url': 'https://pbs.twimg.com/profile_banners/4196983835/1661991479',
   'profile_link_color': 'F5ABB5',
   'profile_sidebar_border_color': '000000',
   'profile_sidebar_fill_color': '000000',
   'profile_text_color': '000000',
   'profile_use_background_image': False,
   'has_extended_profile': False,
   'default_profile': False,
   'default_profile_image': False,
   'following': None,
   'follow_request_sent': None,
   'notifications': None,
   'translator_type': 'none',
   'withheld_in_countries': []},
  'geo': None,
  'coordinates': None,
  'place': None,
  'contributors': None,
  'is_quote_status': False,
  'retweet_count': 5276,
  'favorite_count': 29229,
  'favorited': False,
  'retweeted': False,
  'possibly_sensitive': False,
  'possibly_sensitive_appealable': False,
  'lang': 'en'}]
In [8]:
len(full_status)
Out[8]:
1615
In [9]:
with open('tweet_json.txt', 'w') as file:
        file.write(json.dumps(full_status, indent = 2))
In [10]:
full_json = pd.DataFrame(full_status)
full_json.head(1)
Out[10]:
created_at id id_str text truncated entities extended_entities source in_reply_to_status_id in_reply_to_status_id_str ... favorite_count favorited retweeted possibly_sensitive possibly_sensitive_appealable lang retweeted_status quoted_status_id quoted_status_id_str quoted_status
0 Tue Aug 01 16:23:56 +0000 2017 892420643555336193 892420643555336193 This is Phineas. He's a mystical boy. Only eve... False {'hashtags': [], 'symbols': [], 'user_mentions... {'media': [{'id': 892420639486877696, 'id_str'... <a href="http://twitter.com/download/iphone" r... NaN None ... 33703 False False False False en NaN NaN NaN NaN

1 rows × 30 columns

In [11]:
full_json.columns
Out[11]:
Index(['created_at', 'id', 'id_str', 'text', 'truncated', 'entities',
       'extended_entities', 'source', 'in_reply_to_status_id',
       'in_reply_to_status_id_str', 'in_reply_to_user_id',
       'in_reply_to_user_id_str', 'in_reply_to_screen_name', 'user', 'geo',
       'coordinates', 'place', 'contributors', 'is_quote_status',
       'retweet_count', 'favorite_count', 'favorited', 'retweeted',
       'possibly_sensitive', 'possibly_sensitive_appealable', 'lang',
       'retweeted_status', 'quoted_status_id', 'quoted_status_id_str',
       'quoted_status'],
      dtype='object')
In [12]:
json_tweets = full_json[['id', 'retweet_count','favorite_count']]
json_tweets
Out[12]:
id retweet_count favorite_count
0 892420643555336193 6973 33703
1 892177421306343426 5276 29229
2 891815181378084864 3465 21976
3 891689557279858688 7193 36799
4 891327558926688256 7719 35186
... ... ... ...
1610 666049248165822465 36 88
1611 666044226329800704 115 246
1612 666033412701032449 36 100
1613 666029285002620928 39 112
1614 666020888022790149 419 2284

1615 rows × 3 columns

In [13]:
json_tweets.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1615 entries, 0 to 1614
Data columns (total 3 columns):
 #   Column          Non-Null Count  Dtype
---  ------          --------------  -----
 0   id              1615 non-null   int64
 1   retweet_count   1615 non-null   int64
 2   favorite_count  1615 non-null   int64
dtypes: int64(3)
memory usage: 38.0 KB
In [14]:
json_tweets.to_csv('json_tweets.csv', header = True, index = False)

Assessing data¶

Visual assessing¶

In [15]:
#Visual assessment of twitter archive data
twitter_archive
Out[15]:
tweet_id in_reply_to_status_id in_reply_to_user_id timestamp source text retweeted_status_id retweeted_status_user_id retweeted_status_timestamp expanded_urls rating_numerator rating_denominator name doggo floofer pupper puppo
0 892420643555336193 NaN NaN 2017-08-01 16:23:56 +0000 <a href="http://twitter.com/download/iphone" r... This is Phineas. He's a mystical boy. Only eve... NaN NaN NaN https://twitter.com/dog_rates/status/892420643... 13 10 Phineas None None None None
1 892177421306343426 NaN NaN 2017-08-01 00:17:27 +0000 <a href="http://twitter.com/download/iphone" r... This is Tilly. She's just checking pup on you.... NaN NaN NaN https://twitter.com/dog_rates/status/892177421... 13 10 Tilly None None None None
2 891815181378084864 NaN NaN 2017-07-31 00:18:03 +0000 <a href="http://twitter.com/download/iphone" r... This is Archie. He is a rare Norwegian Pouncin... NaN NaN NaN https://twitter.com/dog_rates/status/891815181... 12 10 Archie None None None None
3 891689557279858688 NaN NaN 2017-07-30 15:58:51 +0000 <a href="http://twitter.com/download/iphone" r... This is Darla. She commenced a snooze mid meal... NaN NaN NaN https://twitter.com/dog_rates/status/891689557... 13 10 Darla None None None None
4 891327558926688256 NaN NaN 2017-07-29 16:00:24 +0000 <a href="http://twitter.com/download/iphone" r... This is Franklin. He would like you to stop ca... NaN NaN NaN https://twitter.com/dog_rates/status/891327558... 12 10 Franklin None None None None
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
2351 666049248165822465 NaN NaN 2015-11-16 00:24:50 +0000 <a href="http://twitter.com/download/iphone" r... Here we have a 1949 1st generation vulpix. Enj... NaN NaN NaN https://twitter.com/dog_rates/status/666049248... 5 10 None None None None None
2352 666044226329800704 NaN NaN 2015-11-16 00:04:52 +0000 <a href="http://twitter.com/download/iphone" r... This is a purebred Piers Morgan. Loves to Netf... NaN NaN NaN https://twitter.com/dog_rates/status/666044226... 6 10 a None None None None
2353 666033412701032449 NaN NaN 2015-11-15 23:21:54 +0000 <a href="http://twitter.com/download/iphone" r... Here is a very happy pup. Big fan of well-main... NaN NaN NaN https://twitter.com/dog_rates/status/666033412... 9 10 a None None None None
2354 666029285002620928 NaN NaN 2015-11-15 23:05:30 +0000 <a href="http://twitter.com/download/iphone" r... This is a western brown Mitsubishi terrier. Up... NaN NaN NaN https://twitter.com/dog_rates/status/666029285... 7 10 a None None None None
2355 666020888022790149 NaN NaN 2015-11-15 22:32:08 +0000 <a href="http://twitter.com/download/iphone" r... Here we have a Japanese Irish Setter. Lost eye... NaN NaN NaN https://twitter.com/dog_rates/status/666020888... 8 10 None None None None None

2356 rows × 17 columns

In [16]:
#Visual assessment of twitter image prediction
image_prediction
Out[16]:
tweet_id jpg_url img_num p1 p1_conf p1_dog p2 p2_conf p2_dog p3 p3_conf p3_dog
0 666020888022790149 https://pbs.twimg.com/media/CT4udn0WwAA0aMy.jpg 1 Welsh_springer_spaniel 0.465074 True collie 0.156665 True Shetland_sheepdog 0.061428 True
1 666029285002620928 https://pbs.twimg.com/media/CT42GRgUYAA5iDo.jpg 1 redbone 0.506826 True miniature_pinscher 0.074192 True Rhodesian_ridgeback 0.072010 True
2 666033412701032449 https://pbs.twimg.com/media/CT4521TWwAEvMyu.jpg 1 German_shepherd 0.596461 True malinois 0.138584 True bloodhound 0.116197 True
3 666044226329800704 https://pbs.twimg.com/media/CT5Dr8HUEAA-lEu.jpg 1 Rhodesian_ridgeback 0.408143 True redbone 0.360687 True miniature_pinscher 0.222752 True
4 666049248165822465 https://pbs.twimg.com/media/CT5IQmsXIAAKY4A.jpg 1 miniature_pinscher 0.560311 True Rottweiler 0.243682 True Doberman 0.154629 True
... ... ... ... ... ... ... ... ... ... ... ... ...
2070 891327558926688256 https://pbs.twimg.com/media/DF6hr6BUMAAzZgT.jpg 2 basset 0.555712 True English_springer 0.225770 True German_short-haired_pointer 0.175219 True
2071 891689557279858688 https://pbs.twimg.com/media/DF_q7IAWsAEuuN8.jpg 1 paper_towel 0.170278 False Labrador_retriever 0.168086 True spatula 0.040836 False
2072 891815181378084864 https://pbs.twimg.com/media/DGBdLU1WsAANxJ9.jpg 1 Chihuahua 0.716012 True malamute 0.078253 True kelpie 0.031379 True
2073 892177421306343426 https://pbs.twimg.com/media/DGGmoV4XsAAUL6n.jpg 1 Chihuahua 0.323581 True Pekinese 0.090647 True papillon 0.068957 True
2074 892420643555336193 https://pbs.twimg.com/media/DGKD1-bXoAAIAUK.jpg 1 orange 0.097049 False bagel 0.085851 False banana 0.076110 False

2075 rows × 12 columns

In [17]:
#Visual assessment of twitter json tweets
json_tweets
Out[17]:
id retweet_count favorite_count
0 892420643555336193 6973 33703
1 892177421306343426 5276 29229
2 891815181378084864 3465 21976
3 891689557279858688 7193 36799
4 891327558926688256 7719 35186
... ... ... ...
1610 666049248165822465 36 88
1611 666044226329800704 115 246
1612 666033412701032449 36 100
1613 666029285002620928 39 112
1614 666020888022790149 419 2284

1615 rows × 3 columns

Programmatic assessing¶

1 - Twitter Archive DataFrame¶

In [18]:
#Getting Twitter Archive columns datatypes and null values
twitter_archive.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2356 entries, 0 to 2355
Data columns (total 17 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   tweet_id                    2356 non-null   int64  
 1   in_reply_to_status_id       78 non-null     float64
 2   in_reply_to_user_id         78 non-null     float64
 3   timestamp                   2356 non-null   object 
 4   source                      2356 non-null   object 
 5   text                        2356 non-null   object 
 6   retweeted_status_id         181 non-null    float64
 7   retweeted_status_user_id    181 non-null    float64
 8   retweeted_status_timestamp  181 non-null    object 
 9   expanded_urls               2297 non-null   object 
 10  rating_numerator            2356 non-null   int64  
 11  rating_denominator          2356 non-null   int64  
 12  name                        2356 non-null   object 
 13  doggo                       2356 non-null   object 
 14  floofer                     2356 non-null   object 
 15  pupper                      2356 non-null   object 
 16  puppo                       2356 non-null   object 
dtypes: float64(4), int64(3), object(10)
memory usage: 313.0+ KB
In [19]:
#Checking if there's duplicated rows in the table
twitter_archive.duplicated().sum()
Out[19]:
0
In [20]:
#Checking duplicated values in tweet_id column 
twitter_archive.tweet_id.duplicated().sum()
Out[20]:
0
In [21]:
#Checking dogs names values for consistency
twitter_archive.name.value_counts()
Out[21]:
None          745
a              55
Charlie        12
Cooper         11
Lucy           11
             ... 
Dex             1
Ace             1
Tayzie          1
Grizzie         1
Christoper      1
Name: name, Length: 957, dtype: int64
In [22]:
twitter_archive.name.sort_values().unique()
Out[22]:
array(['Abby', 'Ace', 'Acro', 'Adele', 'Aiden', 'Aja', 'Akumi', 'Al',
       'Albert', 'Albus', 'Aldrick', 'Alejandro', 'Alexander',
       'Alexanderson', 'Alf', 'Alfie', 'Alfy', 'Alice', 'Amber',
       'Ambrose', 'Amy', 'Amélie', 'Anakin', 'Andru', 'Andy', 'Angel',
       'Anna', 'Anthony', 'Antony', 'Apollo', 'Aqua', 'Archie', 'Arlen',
       'Arlo', 'Arnie', 'Arnold', 'Arya', 'Ash', 'Asher', 'Ashleigh',
       'Aspen', 'Astrid', 'Atlas', 'Atticus', 'Aubie', 'Augie', 'Autumn',
       'Ava', 'Axel', 'Bailey', 'Baloo', 'Balto', 'Banditt', 'Banjo',
       'Barclay', 'Barney', 'Baron', 'Barry', 'Batdog', 'Bauer', 'Baxter',
       'Bayley', 'BeBe', 'Bear', 'Beau', 'Beckham', 'Beebop', 'Beemo',
       'Bell', 'Bella', 'Belle', 'Ben', 'Benedict', 'Benji', 'Benny',
       'Bentley', 'Berb', 'Berkeley', 'Bernie', 'Bert', 'Bertson',
       'Betty', 'Beya', 'Biden', 'Bilbo', 'Billl', 'Billy', 'Binky',
       'Birf', 'Bisquick', 'Blakely', 'Blanket', 'Blipson', 'Blitz',
       'Bloo', 'Bloop', 'Blu', 'Blue', 'Bluebert', 'Bo', 'Bob', 'Bobb',
       'Bobbay', 'Bobble', 'Bobby', 'Bode', 'Bodie', 'Bonaparte', 'Bones',
       'Bookstore', 'Boomer', 'Boots', 'Boston', 'Bowie', 'Brad',
       'Bradlay', 'Bradley', 'Brady', 'Brandi', 'Brandonald', 'Brandy',
       'Brat', 'Brian', 'Brockly', 'Brody', 'Bronte', 'Brooks', 'Brownie',
       'Bruce', 'Brudge', 'Bruiser', 'Bruno', 'Brutus', 'Bubba',
       'Bubbles', 'Buckley', 'Buddah', 'Buddy', 'Bungalo', 'Burt',
       'Butter', 'Butters', 'Cal', 'Calbert', 'Cali', 'Callie', 'Calvin',
       'Canela', 'Cannon', 'Carbon', 'Carl', 'Carll', 'Carly', 'Carper',
       'Carter', 'Caryl', 'Cash', 'Cassie', 'CeCe', 'Cecil', 'Cedrick',
       'Cermet', 'Chadrick', 'Champ', 'Charl', 'Charles', 'Charleson',
       'Charlie', 'Chase', 'Chaz', 'Cheesy', 'Chef', 'Chelsea', 'Cheryl',
       'Chesney', 'Chester', 'Chesterson', 'Chet', 'Chevy', 'Chip',
       'Chipson', 'Chloe', 'Chompsky', 'Christoper', 'Chubbs', 'Chuck',
       'Chuckles', 'Chuq', 'Churlie', 'Cilantro', 'Clarence', 'Clark',
       'Clarkus', 'Clarq', 'Claude', 'Cleopatricia', 'Clifford', 'Clybe',
       'Clyde', 'Coco', 'Cody', 'Colby', 'Coleman', 'Colin', 'Combo',
       'Comet', 'Cooper', 'Coops', 'Coopson', 'Cora', 'Corey', 'Covach',
       'Craig', 'Crawford', 'Creg', 'Crimson', 'Crouton', 'Crumpet',
       'Crystal', 'Cuddles', 'Cupcake', 'Cupid', 'Curtis', 'Daisy',
       'Dakota', 'Dale', 'Dallas', 'Damon', 'Daniel', 'Danny', 'Dante',
       'Darby', 'Darla', 'Darrel', 'Dash', 'Dave', 'Davey', 'Dawn',
       'DayZ', 'Deacon', 'Derby', 'Derek', 'Devón', 'Dewey', 'Dex',
       'Dexter', 'Dido', 'Dietrich', 'Diogi', 'Divine', 'Dixie', 'Django',
       'Dobby', 'Doc', 'DonDon', 'Donny', 'Doobert', 'Dook', 'Dot',
       'Dotsy', 'Doug', 'Duchess', 'Duddles', 'Dudley', 'Dug', 'Duke',
       'Dunkin', 'Durg', 'Dutch', 'Dwight', 'Dylan', 'Earl', 'Eazy',
       'Ebby', 'Ed', 'Edd', 'Edgar', 'Edmund', 'Eevee', 'Einstein',
       'Eleanor', 'Eli', 'Ellie', 'Elliot', 'Emanuel', 'Ember', 'Emma',
       'Emmie', 'Emmy', 'Enchilada', 'Erik', 'Eriq', 'Ester', 'Eugene',
       'Eve', 'Evy', 'Fabio', 'Farfle', 'Ferg', 'Fido', 'Fiji', 'Fillup',
       'Filup', 'Finley', 'Finn', 'Finnegus', 'Fiona', 'Fizz', 'Flash',
       'Fletcher', 'Florence', 'Flurpson', 'Flávio', 'Frank', 'Frankie',
       'Franklin', 'Franq', 'Fred', 'Freddery', 'Frönq', 'Furzey', 'Fwed',
       'Fynn', 'Gabby', 'Gabe', 'Gary', 'General', 'Genevieve', 'Geno',
       'Geoff', 'George', 'Georgie', 'Gerald', 'Gerbald', 'Gert',
       'Gidget', 'Gilbert', 'Gin', 'Ginger', 'Gizmo', 'Glacier', 'Glenn',
       'Godi', 'Godzilla', 'Goliath', 'Goose', 'Gordon', 'Grady', 'Grey',
       'Griffin', 'Griswold', 'Grizz', 'Grizzie', 'Grizzwald', 'Gromit',
       'Gunner', 'Gus', 'Gustaf', 'Gustav', 'Gòrdón', 'Hall', 'Halo',
       'Hammond', 'Hamrick', 'Hank', 'Hanz', 'Happy', 'Harlso', 'Harnold',
       'Harold', 'Harper', 'Harrison', 'Harry', 'Harvey', 'Hazel',
       'Hector', 'Heinrich', 'Henry', 'Herald', 'Herb', 'Hercules',
       'Herm', 'Hermione', 'Hero', 'Herschel', 'Hobbes', 'Holly',
       'Horace', 'Howie', 'Hubertson', 'Huck', 'Humphrey', 'Hunter',
       'Hurley', 'Huxley', 'Iggy', 'Ike', 'Indie', 'Iroh', 'Ito', 'Ivar',
       'Izzy', 'JD', 'Jack', 'Jackie', 'Jackson', 'Jameson', 'Jamesy',
       'Jangle', 'Jareld', 'Jarod', 'Jarvis', 'Jaspers', 'Jax', 'Jay',
       'Jaycob', 'Jazz', 'Jazzy', 'Jeb', 'Jebberson', 'Jed', 'Jeffrey',
       'Jeffri', 'Jeffrie', 'Jennifur', 'Jeph', 'Jeremy', 'Jerome',
       'Jerry', 'Jersey', 'Jesse', 'Jessifer', 'Jessiga', 'Jett', 'Jim',
       'Jimbo', 'Jiminus', 'Jiminy', 'Jimison', 'Jimothy', 'Jo',
       'Jockson', 'Joey', 'Jomathan', 'Jonah', 'Jordy', 'Josep', 'Joshwa',
       'Juckson', 'Julio', 'Julius', 'Juno', 'Kaia', 'Kaiya', 'Kallie',
       'Kane', 'Kanu', 'Kara', 'Karl', 'Karll', 'Karma', 'Kathmandu',
       'Katie', 'Kawhi', 'Kayla', 'Keet', 'Keith', 'Kellogg', 'Ken',
       'Kendall', 'Kenneth', 'Kenny', 'Kenzie', 'Keurig', 'Kevin',
       'Kevon', 'Kial', 'Kilo', 'Kingsley', 'Kirby', 'Kirk', 'Klein',
       'Klevin', 'Kloey', 'Kobe', 'Koda', 'Kody', 'Koko', 'Kollin',
       'Kona', 'Kota', 'Kramer', 'Kreg', 'Kreggory', 'Kulet', 'Kuyu',
       'Kyle', 'Kyro', 'Lacy', 'Laela', 'Laika', 'Lambeau', 'Lance',
       'Larry', 'Lassie', 'Layla', 'Leela', 'Lennon', 'Lenny', 'Lenox',
       'Leo', 'Leonard', 'Leonidas', 'Levi', 'Liam', 'Lilah', 'Lili',
       'Lilli', 'Lillie', 'Lilly', 'Lily', 'Lincoln', 'Linda', 'Link',
       'Linus', 'Lipton', 'Livvie', 'Lizzie', 'Logan', 'Loki', 'Lola',
       'Lolo', 'Longfellow', 'Loomis', 'Lorelei', 'Lorenzo', 'Lou',
       'Louie', 'Louis', 'Luca', 'Lucia', 'Lucky', 'Lucy', 'Lugan',
       'Lulu', 'Luna', 'Lupe', 'Luther', 'Mabel', 'Mac', 'Mack', 'Maddie',
       'Maggie', 'Mairi', 'Maisey', 'Major', 'Maks', 'Malcolm', 'Malikai',
       'Margo', 'Mark', 'Marlee', 'Marley', 'Marq', 'Marty', 'Marvin',
       'Mary', 'Mason', 'Mattie', 'Maude', 'Mauve', 'Max', 'Maxaroni',
       'Maximus', 'Maxwell', 'Maya', 'Meatball', 'Meera', 'Meyer', 'Mia',
       'Michelangelope', 'Miguel', 'Mike', 'Miley', 'Milky', 'Millie',
       'Milo', 'Mimosa', 'Mingus', 'Mister', 'Misty', 'Mitch', 'Mo',
       'Moe', 'Mojo', 'Mollie', 'Molly', 'Mona', 'Monkey', 'Monster',
       'Monty', 'Moofasa', 'Mookie', 'Moose', 'Moreton', 'Mosby',
       'Murphy', 'Mutt', 'Mya', 'Nala', 'Naphaniel', 'Napolean', 'Nelly',
       'Neptune', 'Newt', 'Nico', 'Nida', 'Nigel', 'Nimbus', 'Noah',
       'Nollie', 'None', 'Noosh', 'Norman', 'Nugget', 'O', 'Oakley',
       'Obi', 'Obie', 'Oddie', 'Odie', 'Odin', 'Olaf', 'Ole', 'Olive',
       'Oliver', 'Olivia', 'Oliviér', 'Ollie', 'Opal', 'Opie', 'Oreo',
       'Orion', 'Oscar', 'Oshie', 'Otis', 'Ozzie', 'Ozzy', 'Pablo',
       'Paisley', 'Pancake', 'Panda', 'Patch', 'Patrick', 'Paull',
       'Pavlov', 'Pawnd', 'Peaches', 'Peanut', 'Penelope', 'Penny',
       'Pepper', 'Percy', 'Perry', 'Pete', 'Petrick', 'Pherb', 'Phil',
       'Philbert', 'Philippe', 'Phineas', 'Phred', 'Pickles', 'Pilot',
       'Pinot', 'Pip', 'Piper', 'Pippa', 'Pippin', 'Pipsy', 'Pluto',
       'Poppy', 'Pubert', 'Puff', 'Pumpkin', 'Pupcasso', 'Quinn', 'Ralf',
       'Ralph', 'Ralpher', 'Ralphie', 'Ralphson', 'Ralphus', 'Ralphy',
       'Ralphé', 'Rambo', 'Randall', 'Raphael', 'Rascal', 'Raymond',
       'Reagan', 'Reese', 'Reggie', 'Reginald', 'Remington', 'Remus',
       'Remy', 'Reptar', 'Rey', 'Rhino', 'Richie', 'Ricky', 'Ridley',
       'Riley', 'Rilo', 'Rinna', 'River', 'Rizzo', 'Rizzy', 'Robin',
       'Rocco', 'Rocky', 'Rodman', 'Rodney', 'Rolf', 'Romeo', 'Ron',
       'Ronduh', 'Ronnie', 'Rontu', 'Rooney', 'Roosevelt', 'Rorie',
       'Rory', 'Roscoe', 'Rose', 'Rosie', 'Rover', 'Rubio', 'Ruby',
       'Rudy', 'Rueben', 'Ruffles', 'Rufio', 'Rufus', 'Rumble', 'Rumpole',
       'Rupert', 'Rusty', 'Sadie', 'Sage', 'Sailer', 'Sailor', 'Sam',
       'Sammy', 'Sampson', 'Samsom', 'Samson', 'Sandra', 'Sandy', 'Sansa',
       'Sarge', 'Saydee', 'Schnitzel', 'Schnozz', 'Scooter', 'Scott',
       'Scout', 'Scruffers', 'Seamus', 'Sebastian', 'Sephie', 'Severus',
       'Shadoe', 'Shadow', 'Shaggy', 'Shakespeare', 'Shawwn', 'Shelby',
       'Shikha', 'Shiloh', 'Shnuggles', 'Shooter', 'Siba', 'Sid',
       'Sierra', 'Simba', 'Skittle', 'Skittles', 'Sky', 'Skye', 'Smiley',
       'Smokey', 'Snickers', 'Snicku', 'Snoop', 'Snoopy', 'Sobe', 'Socks',
       'Sojourner', 'Solomon', 'Sonny', 'Sophie', 'Sora', 'Spanky',
       'Spark', 'Sparky', 'Spencer', 'Sprinkles', 'Sprout', 'Staniel',
       'Stanley', 'Stark', 'Stefan', 'Stella', 'Stephan', 'Stephanus',
       'Steve', 'Steven', 'Stewie', 'Storkson', 'Stormy', 'Strider',
       'Striker', 'Strudel', 'Stu', 'Stuart', 'Stubert', 'Sugar', 'Suki',
       'Sully', 'Sundance', 'Sunny', 'Sunshine', 'Superpup', 'Swagger',
       'Sweet', 'Sweets', 'Taco', 'Tango', 'Tanner', 'Tassy', 'Tater',
       'Tayzie', 'Taz', 'Tebow', 'Ted', 'Tedders', 'Teddy', 'Tedrick',
       'Terrance', 'Terrenth', 'Terry', 'Tess', 'Tessa', 'Theo',
       'Theodore', 'Thor', 'Thumas', 'Tiger', 'Tilly', 'Timber',
       'Timison', 'Timmy', 'Timofy', 'Tino', 'Titan', 'Tito', 'Tobi',
       'Toby', 'Todo', 'Toffee', 'Tom', 'Tommy', 'Tonks', 'Torque',
       'Tove', 'Travis', 'Traviss', 'Trevith', 'Trigger', 'Trip', 'Tripp',
       'Trooper', 'Tuck', 'Tucker', 'Tuco', 'Tug', 'Tupawc', 'Tycho',
       'Tyr', 'Tyrone', 'Tyrus', 'Ulysses', 'Venti', 'Vince', 'Vincent',
       'Vinnie', 'Vinscent', 'Vixen', 'Wafer', 'Waffles', 'Walker',
       'Wallace', 'Wally', 'Walter', 'Watson', 'Wesley', 'Wiggles',
       'Willem', 'William', 'Willie', 'Willow', 'Willy', 'Wilson',
       'Winifred', 'Winnie', 'Winston', 'Wishes', 'Wyatt', 'Yoda', 'Yogi',
       'Yukon', 'Zara', 'Zeek', 'Zeke', 'Zeus', 'Ziva', 'Zoe', 'Zoey',
       'Zooey', 'Zuzu', 'a', 'actually', 'all', 'an', 'by', 'getting',
       'his', 'incredibly', 'infuriating', 'just', 'life', 'light', 'mad',
       'my', 'not', 'officially', 'old', 'one', 'quite', 'space', 'such',
       'the', 'this', 'unacceptable', 'very'], dtype=object)
In [23]:
#Check how many rating have denominator other than 10
len(twitter_archive.query('rating_denominator != 10'))
Out[23]:
23
In [24]:
#Checking tweets with denominator other than 10
twitter_archive.query('rating_denominator != 10')[['text', 'rating_denominator']]
Out[24]:
text rating_denominator
313 @jonnysun @Lin_Manuel ok jomny I know you're e... 0
342 @docmisterio account started on 11/15/15 15
433 The floofs have been released I repeat the flo... 70
516 Meet Sam. She smiles 24/7 &amp; secretly aspir... 7
784 RT @dog_rates: After so many requests, this is... 11
902 Why does this never happen at my front door...... 150
1068 After so many requests, this is Bretagne. She ... 11
1120 Say hello to this unbelievably well behaved sq... 170
1165 Happy 4/20 from the squad! 13/10 for all https... 20
1202 This is Bluebert. He just saw that both #Final... 50
1228 Happy Saturday here's 9 puppers on a bench. 99... 90
1254 Here's a brigade of puppers. All look very pre... 80
1274 From left to right:\nCletus, Jerome, Alejandro... 50
1351 Here is a whole flock of puppers. 60/50 I'll ... 50
1433 Happy Wednesday here's a bucket of pups. 44/40... 40
1598 Yes I do realize a rating of 4/20 would've bee... 20
1634 Two sneaky puppers were not initially seen, mo... 130
1635 Someone help the girl is being mugged. Several... 110
1662 This is Darrel. He just robbed a 7/11 and is i... 11
1663 I'm aware that I could've said 20/16, but here... 16
1779 IT'S PUPPERGEDDON. Total of 144/120 ...I think... 120
1843 Here we have an entire platoon of puppers. Tot... 80
2335 This is an Albanian 3 1/2 legged Episcopalian... 2
In [25]:
twitter_archive.rating_numerator.min() , twitter_archive.rating_numerator.max()
Out[25]:
(0, 1776)
In [26]:
twitter_archive.rating_numerator.describe()
Out[26]:
count    2356.000000
mean       13.126486
std        45.876648
min         0.000000
25%        10.000000
50%        11.000000
75%        12.000000
max      1776.000000
Name: rating_numerator, dtype: float64
In [27]:
#Checking Outliers of the numerator (any rating greater than 15 or less than 6)
import plotly.express as px
px.box(twitter_archive, y='rating_numerator')
In [28]:
#checking how many outliers are in the numerator column
len(twitter_archive.query('(rating_numerator > 15) | (rating_numerator < 6)' ))
Out[28]:
119
In [29]:
twitter_archive.query('(rating_numerator > 15) | (rating_numerator < 6)')[['text', 'rating_numerator']]
Out[29]:
text rating_numerator
45 This is Bella. She hopes her smile made you sm... 5
55 @roushfenway These are good dogs but 17/10 is ... 17
188 @dhmontgomery We also gave snoop dogg a 420/10... 420
189 @s8n You tried very hard to portray this good ... 666
290 @markhoppus 182/10 182
... ... ...
2334 This is a funny dog. Weird toes. Won't come do... 3
2335 This is an Albanian 3 1/2 legged Episcopalian... 1
2338 Not familiar with this breed. No tail (weird).... 1
2349 This is an odd dog. Hard on the outside but lo... 2
2351 Here we have a 1949 1st generation vulpix. Enj... 5

119 rows × 2 columns

In [30]:
#Checking tweets source values for consistency
twitter_archive.source.value_counts()
Out[30]:
<a href="http://twitter.com/download/iphone" rel="nofollow">Twitter for iPhone</a>     2221
<a href="http://vine.co" rel="nofollow">Vine - Make a Scene</a>                          91
<a href="http://twitter.com" rel="nofollow">Twitter Web Client</a>                       33
<a href="https://about.twitter.com/products/tweetdeck" rel="nofollow">TweetDeck</a>      11
Name: source, dtype: int64
In [31]:
#Checking doggo column values for consistency and understanding the data
twitter_archive['doggo'].value_counts()
Out[31]:
None     2259
doggo      97
Name: doggo, dtype: int64
In [32]:
#Checking floofer column values for consistency and understanding the data
twitter_archive['floofer'].value_counts()
Out[32]:
None       2346
floofer      10
Name: floofer, dtype: int64
In [33]:
#Checking pupper column values for consistency and understanding the data
twitter_archive['pupper'].value_counts()
Out[33]:
None      2099
pupper     257
Name: pupper, dtype: int64
In [34]:
#Checking puppo column values for consistency and understanding the data
twitter_archive['puppo'].value_counts()
Out[34]:
None     2326
puppo      30
Name: puppo, dtype: int64

2 - Image Prediction DataFrame¶

In [35]:
#Getting Image Prediction columns datatypes and null values
image_prediction.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2075 entries, 0 to 2074
Data columns (total 12 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   tweet_id  2075 non-null   int64  
 1   jpg_url   2075 non-null   object 
 2   img_num   2075 non-null   int64  
 3   p1        2075 non-null   object 
 4   p1_conf   2075 non-null   float64
 5   p1_dog    2075 non-null   bool   
 6   p2        2075 non-null   object 
 7   p2_conf   2075 non-null   float64
 8   p2_dog    2075 non-null   bool   
 9   p3        2075 non-null   object 
 10  p3_conf   2075 non-null   float64
 11  p3_dog    2075 non-null   bool   
dtypes: bool(3), float64(3), int64(2), object(4)
memory usage: 152.1+ KB
In [36]:
#Checking if there's duplicated rows in the table
image_prediction.duplicated().sum()
Out[36]:
0
In [37]:
#Checking duplicated values in tweet_id column 
image_prediction.tweet_id.duplicated().sum()
Out[37]:
0
In [38]:
#Checking values of p1 column
image_prediction.p1.value_counts()
Out[38]:
golden_retriever      150
Labrador_retriever    100
Pembroke               89
Chihuahua              83
pug                    57
                     ... 
pillow                  1
carousel                1
bald_eagle              1
lorikeet                1
orange                  1
Name: p1, Length: 378, dtype: int64
In [39]:
#Getting all values of p1 column
image_prediction.p1.unique()
Out[39]:
array(['Welsh_springer_spaniel', 'redbone', 'German_shepherd',
       'Rhodesian_ridgeback', 'miniature_pinscher',
       'Bernese_mountain_dog', 'box_turtle', 'chow', 'shopping_cart',
       'miniature_poodle', 'golden_retriever', 'Gordon_setter',
       'Walker_hound', 'pug', 'bloodhound', 'Lhasa', 'English_setter',
       'hen', 'desktop_computer', 'Italian_greyhound', 'Maltese_dog',
       'three-toed_sloth', 'ox', 'malamute', 'guinea_pig',
       'soft-coated_wheaten_terrier', 'Chihuahua',
       'black-and-tan_coonhound', 'coho', 'toy_terrier',
       'Blenheim_spaniel', 'Pembroke', 'llama',
       'Chesapeake_Bay_retriever', 'curly-coated_retriever', 'dalmatian',
       'Ibizan_hound', 'Border_collie', 'Labrador_retriever', 'seat_belt',
       'snail', 'miniature_schnauzer', 'Airedale', 'triceratops', 'swab',
       'hay', 'hyena', 'jigsaw_puzzle', 'West_Highland_white_terrier',
       'toy_poodle', 'giant_schnauzer', 'vizsla', 'vacuum', 'Rottweiler',
       'Siberian_husky', 'teddy', 'papillon', 'Saint_Bernard',
       'porcupine', 'goose', 'Tibetan_terrier', 'borzoi', 'beagle',
       'hare', 'Yorkshire_terrier', 'Pomeranian', 'electric_fan',
       'web_site', 'ibex', 'kuvasz', 'fire_engine', 'lorikeet',
       'flat-coated_retriever', 'toyshop', 'common_iguana',
       'Norwegian_elkhound', 'frilled_lizard', 'leatherback_turtle',
       'hamster', 'Angora', 'Arctic_fox', 'trombone', 'canoe',
       'king_penguin', 'shopping_basket', 'standard_poodle',
       'Staffordshire_bullterrier', 'basenji', 'Lakeland_terrier',
       'American_Staffordshire_terrier', 'bearskin', 'Shih-Tzu',
       'bustard', 'crash_helmet', 'French_bulldog', 'Pekinese',
       'komondor', 'ski_mask', 'malinois', 'kelpie', 'Brittany_spaniel',
       'cocker_spaniel', 'shower_curtain', 'basset', 'jellyfish',
       'doormat', 'Arabian_camel', 'lynx', 'hog', 'comic_book', 'minivan',
       'seashore', 'cuirass', 'Brabancon_griffon', 'candle', 'Eskimo_dog',
       'weasel', 'Christmas_stocking', 'washbasin', 'car_mirror',
       'piggy_bank', 'pot', 'boathouse', 'mud_turtle',
       'German_short-haired_pointer', 'Shetland_sheepdog',
       'Irish_terrier', 'cairn', 'platypus', 'English_springer',
       'whippet', 'ping-pong_ball', 'sea_urchin', 'bow_tie',
       'window_shade', "jack-o'-lantern", 'sorrel', 'Sussex_spaniel',
       'peacock', 'axolotl', 'wool', 'banana', 'Dandie_Dinmont',
       'Norwich_terrier', 'wood_rabbit', 'dhole', 'keeshond',
       'Norfolk_terrier', 'lacewing', 'dingo', 'brown_bear',
       'Old_English_sheepdog', 'scorpion', 'flamingo', 'microphone',
       'Samoyed', 'pitcher', 'African_hunting_dog', 'refrigerator',
       'picket_fence', 'tub', 'zebra', 'hermit_crab', 'swing', 'Doberman',
       'park_bench', 'feather_boa', 'Loafer', 'stone_wall', 'ice_bear',
       'prayer_rug', 'chimpanzee', 'china_cabinet', 'bee_eater',
       'tennis_ball', 'carton', 'killer_whale', 'ostrich', 'terrapin',
       'Siamese_cat', 'gondola', 'Great_Pyrenees', 'microwave',
       'starfish', 'sandbar', 'tusker', 'motor_scooter', 'ram',
       'leaf_beetle', 'wombat', 'schipperke', 'Newfoundland',
       'bull_mastiff', 'water_bottle', 'suit', 'toilet_seat', 'collie',
       'robin', 'Cardigan', 'Greater_Swiss_Mountain_dog', 'slug',
       'toilet_tissue', 'acorn_squash', 'soccer_ball',
       'African_crocodile', 'tick', 'ocarina', 'boxer', 'street_sign',
       'bow', 'stove', 'paper_towel', 'upright', 'dough',
       'Scottish_deerhound', 'bath_towel', 'standard_schnauzer',
       'walking_stick', 'Irish_water_spaniel', 'bubble', 'Boston_bull',
       'book_jacket', 'rain_barrel', 'black-footed_ferret', 'guenon',
       'Japanese_spaniel', 'water_buffalo', 'patio', 'cowboy_hat',
       'dogsled', 'maze', 'harp', 'panpipe', 'cash_machine', 'mailbox',
       'wallaby', 'EntleBucher', 'earthstar', 'pillow', 'bluetick',
       'space_heater', 'carousel', 'Irish_setter', 'birdhouse', 'snorkel',
       'bald_eagle', 'koala', 'Leonberg', 'cheetah', 'minibus',
       'Weimaraner', 'clog', 'dishwasher', 'white_wolf', 'sliding_door',
       'damselfly', 'Great_Dane', 'Tibetan_mastiff', 'cheeseburger',
       'fiddler_crab', 'bannister', 'crane', 'Scotch_terrier',
       'snowmobile', 'badger', 'bighorn', 'geyser', 'barrow', 'bison',
       'Mexican_hairless', 'ice_lolly', 'sea_lion', 'dining_table',
       'groenendael', 'Australian_terrier', 'beaver', 'briard',
       'Appenzeller', 'grey_fox', 'mousetrap', 'hippopotamus',
       'Border_terrier', 'hummingbird', 'tailed_frog', 'otter',
       'Egyptian_cat', 'four-poster', 'wild_boar', 'bathtub', 'agama',
       'muzzle', 'hotdog', 'bib', 'espresso', 'timber_wolf', 'meerkat',
       'nail', 'hammer', 'home_theater', 'alp', 'bonnet', 'handkerchief',
       'hand_blower', 'polecat', 'lakeside', 'studio_couch', 'cup',
       'cliff', 'Bedlington_terrier', 'lawn_mower', 'balloon',
       'sunglasses', 'rapeseed', 'traffic_light', 'coil', 'binoculars',
       'paddle', 'tiger_shark', 'sulphur-crested_cockatoo',
       'wire-haired_fox_terrier', 'Saluki', 'American_black_bear',
       'rotisserie', 'conch', 'skunk', 'bookshop', 'radio_telescope',
       'cougar', 'African_grey', 'coral_reef', 'lion', 'maillot',
       'Madagascar_cat', 'tabby', 'silky_terrier', 'giant_panda',
       'long-horned_beetle', 'Afghan_hound', 'clumber', 'sundial',
       'padlock', 'pool_table', 'quilt', 'beach_wagon', 'remote_control',
       'bakery', 'pedestal', 'gas_pump', 'bookcase', 'shield', 'loupe',
       'restaurant', 'prison', 'school_bus', 'cowboy_boot', 'jersey',
       'wooden_spoon', 'leopard', 'mortarboard', 'teapot',
       'military_uniform', 'washer', 'coffee_mug', 'fountain',
       'pencil_box', 'barbell', 'grille', 'revolver', 'envelope',
       'syringe', 'marmot', 'pole', 'laptop', 'basketball', 'tricycle',
       'convertible', 'limousine', 'orange'], dtype=object)
In [40]:
#Checking some odd data 
image_prediction.query('p1 == "shopping_cart"').jpg_url
Out[40]:
8       https://pbs.twimg.com/media/CT5PY90WoAAQGLo.jpg
703     https://pbs.twimg.com/media/CYFOP6cWEAAWp-k.jpg
1432    https://pbs.twimg.com/media/CrtYRMEWIAAUkCl.jpg
1760    https://pbs.twimg.com/media/C3YaSnQWAAILgz0.jpg
1834    https://pbs.twimg.com/media/C52V7PzWcAA_pVv.jpg
Name: jpg_url, dtype: object

In [41]:
#Checking values of p2 column
image_prediction.p2.value_counts()
Out[41]:
Labrador_retriever    104
golden_retriever       92
Cardigan               73
Chihuahua              44
Pomeranian             42
                     ... 
medicine_chest          1
quail                   1
horse_cart              1
waffle_iron             1
bagel                   1
Name: p2, Length: 405, dtype: int64
In [42]:
#Getting all values of p2 column
image_prediction.p2.unique()
Out[42]:
array(['collie', 'miniature_pinscher', 'malinois', 'redbone',
       'Rottweiler', 'English_springer', 'mud_turtle', 'Tibetan_mastiff',
       'shopping_basket', 'komondor', 'Yorkshire_terrier',
       'English_foxhound', 'bull_mastiff', 'German_shepherd', 'Shih-Tzu',
       'Newfoundland', 'cock', 'desk', 'toy_terrier', 'toy_poodle',
       'otter', 'Chesapeake_Bay_retriever', 'Siberian_husky', 'skunk',
       'Afghan_hound', 'bloodhound', 'barracouta', 'papillon',
       'cocker_spaniel', 'chow', 'Irish_terrier', 'chain_saw', 'beagle',
       'giant_schnauzer', 'Labrador_retriever', 'Pembroke', 'Chihuahua',
       'Weimaraner', 'slug', 'Brittany_spaniel', 'standard_schnauzer',
       'teddy', 'armadillo', 'African_hunting_dog', 'vizsla', 'doormat',
       'pug', 'Italian_greyhound', 'Samoyed', 'Pomeranian',
       'miniature_poodle', 'Lakeland_terrier', 'Irish_setter', 'swab',
       'malamute', 'bath_towel', 'Border_collie', 'Leonberg', 'drake',
       'French_bulldog', 'ice_bear', 'Christmas_stocking',
       'golden_retriever', 'standard_poodle', 'dhole', 'kuvasz',
       'Cardigan', 'silky_terrier', 'spotlight', 'dishwasher', 'bighorn',
       'tow_truck', 'hummingbird', 'English_setter', 'prayer_rug',
       'frilled_lizard', 'Pekinese', 'ox', 'boxer', 'hog', 'guinea_pig',
       'hen', 'wallaby', 'cowboy_boot', 'cornet', 'minivan', 'paddle',
       'basset', 'hamper', 'Bedlington_terrier', 'Shetland_sheepdog',
       'bow', 'Lhasa', 'pelican', 'toaster', 'groenendael',
       'Australian_terrier', 'llama', 'knee_pad', 'pillow',
       'Ibizan_hound', 'Old_English_sheepdog', 'Welsh_springer_spaniel',
       'coral_reef', 'bison', 'waffle_iron', 'tabby', 'bib', 'police_van',
       'Eskimo_dog', 'breastplate', 'German_short-haired_pointer',
       'Norfolk_terrier', 'Blenheim_spaniel', 'pickup',
       'miniature_schnauzer', 'lampshade', 'Tibetan_terrier',
       'Siamese_cat', 'borzoi', 'studio_couch', 'toilet_seat', 'hamster',
       'seat_belt', 'keeshond', 'koala', 'hair_spray', 'Saint_Bernard',
       'tray', 'American_Staffordshire_terrier', 'birdhouse', 'terrapin',
       'Staffordshire_bullterrier', 'briard',
       'West_Highland_white_terrier', 'spotted_salamander', 'tennis_ball',
       'porcupine', 'cardigan', 'corn', 'basenji', 'otterhound',
       'European_gallinule', 'indri', 'tailed_frog', 'beach_wagon',
       'siamang', 'orange', 'home_theater', 'cairn', 'hare',
       'Norwegian_elkhound', 'Brabancon_griffon', 'American_black_bear',
       'sulphur_butterfly', 'Sealyham_terrier', 'Walker_hound',
       'tarantula', 'Persian_cat', 'coral_fungus', 'accordion', 'kelpie',
       'Great_Pyrenees', 'wood_rabbit', 'black-and-tan_coonhound',
       'sunglasses', 'plow', 'whippet', 'rain_barrel', 'bathtub', 'tiger',
       'snail', 'tick', 'wire-haired_fox_terrier', 'water_bottle', 'wig',
       'platypus', 'Irish_wolfhound', 'ram', 'gorilla',
       'entertainment_center', 'toucan', 'mask', 'shopping_cart',
       'Sussex_spaniel', 'crate', 'grey_whale', 'badger', 'Airedale',
       'Arabian_camel', 'cockroach', 'lifeboat', 'rotisserie', 'goldfish',
       'stingray', 'warthog', 'bobsled', 'rhinoceros_beetle', 'beaver',
       'brown_bear', 'Maltese_dog', 'weasel', 'quill',
       'Rhodesian_ridgeback', 'Arctic_fox', 'ashcan', 'bow_tie',
       'soft-coated_wheaten_terrier', 'schipperke', 'bearskin',
       'Kerry_blue_terrier', 'ice_lolly', 'American_alligator',
       'mosquito_net', 'sea_lion', 'Boston_bull', 'nail',
       'black-footed_ferret', 'promontory', 'sarong', 'Doberman',
       'space_heater', 'Great_Dane', 'mailbox', 'Saluki', 'bakery',
       'sandal', 'leafhopper', 'barrel', 'water_buffalo', 'polecat',
       'macaque', 'Japanese_spaniel', 'folding_chair', 'trench_coat',
       'Angora', 'junco', 'crib', 'dalmatian', 'snowmobile',
       'flat-coated_retriever', 'streetcar', 'window_screen', 'bannister',
       'hair_slide', 'meerkat', 'cannon', 'jaguar', 'Border_terrier',
       'Greater_Swiss_Mountain_dog', 'hay', 'apron', 'cloak', 'radiator',
       'muzzle', 'feather_boa', 'rifle', 'chimpanzee', 'loggerhead',
       'torch', 'Mexican_hairless', 'spindle', 'triceratops',
       'Appenzeller', 'stove', 'dingo', 'oscilloscope', 'common_newt',
       'hotdog', 'medicine_chest', 'quail', 'horse_cart', 'four-poster',
       'pier', 'red_fox', 'affenpinscher', 'assault_rifle',
       'mashed_potato', 'moped', 'hyena', 'seashore', 'tub', 'sports_car',
       'swing', 'mink', 'bluetick', 'neck_brace', 'grey_fox', 'mongoose',
       'fur_coat', 'Scotch_terrier', 'spatula', 'paper_towel', 'shoji',
       'toyshop', 'banded_gecko', 'peacock', 'crossword_puzzle',
       'tree_frog', 'wombat', 'turnstile', 'sleeping_bag', 'quilt',
       'Gila_monster', 'giant_panda', 'Dandie_Dinmont', 'handkerchief',
       'sombrero', 'Indian_elephant', 'coffee_mug', 'gibbon', 'carton',
       'screw', 'minibus', 'hatchet', 'window_shade', 'lawn_mower',
       'washbasin', 'sock', 'prison', 'patio', 'china_cabinet',
       'chain_mail', 'breakwater', 'computer_keyboard', 'goose',
       'lakeside', 'solar_dish', 'table_lamp', 'Windsor_tie',
       'punching_bag', 'comic_book', 'sunglass', 'great_white_shark',
       'timber_wolf', 'fountain', 'dugong', 'marmot', 'barbershop',
       'shovel', 'curly-coated_retriever', 'lesser_panda', 'monitor',
       'crutch', 'cash_machine', 'printer', 'volcano', 'wallet', 'laptop',
       'bathing_cap', 'confectionery', 'dam', 'killer_whale', 'canoe',
       'Madagascar_cat', 'jean', 'boathouse', 'cliff', 'maillot', 'iPod',
       'hand-held_computer', 'black_widow', 'Norwich_terrier', 'necklace',
       'dining_table', 'binoculars', 'cradle', 'sea_urchin', 'cougar',
       'EntleBucher', 'basketball', 'lighter', 'saltshaker', 'harvester',
       'television', 'envelope', 'house_finch', 'web_site', 'palace',
       'shower_curtain', 'cab', 'snorkel', 'jigsaw_puzzle', 'sweatshirt',
       'white_wolf', 'sliding_door', 'academic_gown', 'cowboy_hat',
       'can_opener', 'cup', 'rule', 'soccer_ball', 'bucket', 'racket',
       'menu', 'purse', 'Bernese_mountain_dog', 'dumbbell', 'projectile',
       'dock', 'oxygen_mask', 'sandbar', 'umbrella', 'shower_cap',
       'bagel'], dtype=object)
In [43]:
#Checking some odd data 
image_prediction.query('p2 == "envelope"').jpg_url
Out[43]:
1626    https://pbs.twimg.com/media/Cyh5mQTW8AQpB6K.jpg
1696    https://pbs.twimg.com/media/C1SddosXUAQcVR1.jpg
1975    https://pbs.twimg.com/media/DBMV3NnXUAAm0Pp.jpg
Name: jpg_url, dtype: object

In [44]:
image_prediction.p3.value_counts()
Out[44]:
Labrador_retriever    79
Chihuahua             58
golden_retriever      48
Eskimo_dog            38
kelpie                35
                      ..
ox                     1
assault_rifle          1
axolotl                1
pot                    1
banana                 1
Name: p3, Length: 408, dtype: int64
In [45]:
#Getting all values of p3 column
image_prediction.p3.unique()
Out[45]:
array(['Shetland_sheepdog', 'Rhodesian_ridgeback', 'bloodhound',
       'miniature_pinscher', 'Doberman', 'Greater_Swiss_Mountain_dog',
       'terrapin', 'fur_coat', 'golden_retriever',
       'soft-coated_wheaten_terrier', 'Labrador_retriever', 'Pekinese',
       'Ibizan_hound', 'French_bulldog', 'malinois', 'Dandie_Dinmont',
       'borzoi', 'partridge', 'bookcase', 'basenji', 'miniature_poodle',
       'great_grey_owl', 'groenendael', 'Eskimo_dog', 'hamster', 'briard',
       'papillon', 'flat-coated_retriever', 'gar', 'Chihuahua',
       'Shih-Tzu', 'Pomeranian', 'dingo', 'power_drill', 'Saluki',
       'Great_Pyrenees', 'West_Highland_white_terrier', 'collie',
       'toy_poodle', 'vizsla', 'acorn', 'giant_schnauzer', 'teddy',
       'common_iguana', 'wig', 'water_buffalo', 'coyote', 'seat_belt',
       'kelpie', 'space_heater', 'Brabancon_griffon', 'standard_poodle',
       'beagle', 'Irish_water_spaniel', 'bluetick', 'Weimaraner',
       'Chesapeake_Bay_retriever', 'toilet_tissue',
       'black-and-tan_coonhound', 'kuvasz', 'Christmas_stocking',
       'badger', 'hen', 'Staffordshire_bullterrier', 'Yorkshire_terrier',
       'Lakeland_terrier', 'weasel', 'ski_mask', 'cocker_spaniel',
       'Australian_terrier', 'lampshade', 'oscilloscope', 'ram', 'jeep',
       'ice_bear', 'African_grey', 'Great_Dane', 'curly-coated_retriever',
       'doormat', 'African_chameleon', 'schipperke', 'muzzle',
       'triceratops', 'Newfoundland', 'Band_Aid', 'wood_rabbit',
       'white_wolf', 'giant_panda', 'Welsh_springer_spaniel',
       'French_horn', 'toy_terrier', 'Pembroke', 'Cardigan', 'bassinet',
       'pug', 'Afghan_hound', 'American_Staffordshire_terrier', 'whippet',
       'English_setter', 'panpipe', 'crane', 'mouse', 'titi', 'Angora',
       'Boston_bull', 'silky_terrier', 'Japanese_spaniel', 'sandbar',
       'balance_beam', 'black-footed_ferret', 'miniature_schnauzer',
       'Blenheim_spaniel', 'bathtub', 'Saint_Bernard', 'redbone',
       'goldfish', 'Norfolk_terrier', 'llama', 'koala', 'pillow',
       'jersey', 'chow', 'minibus', 'malamute', 'bulletproof_vest',
       'beach_wagon', 'cairn', 'plunger', 'paper_towel', 'wing',
       'English_foxhound', 'Brittany_spaniel', 'bolete', 'ashcan',
       'box_turtle', 'guinea_pig', 'bison', 'bull_mastiff', 'racket',
       'cardoon', 'Tibetan_mastiff', 'window_screen', 'Irish_terrier',
       'agama', 'common_newt', 'car_wheel', 'gorilla', 'bagel', 'clumber',
       'Egyptian_cat', 'television', 'boxer', 'brown_bear', 'leafhopper',
       'German_shepherd', 'Border_collie', 'menu', 'wolf_spider',
       'bathing_cap', 'stinkhorn', 'drumstick', 'mask',
       'Scottish_deerhound', 'shower_curtain', 'Appenzeller',
       'plastic_bag', 'swimming_trunks', 'prairie_chicken', 'red_wolf',
       'Maltese_dog', 'snail', 'gibbon', 'Gordon_setter', 'black_swan',
       'beacon', 'wool', 'cowboy_boot', 'Rottweiler', 'poncho', 'swing',
       'Arctic_fox', 'bib', 'Italian_greyhound', 'steam_locomotive',
       'fountain', 'chickadee', 'abaya', 'Border_terrier', 'bubble',
       'chimpanzee', 'hammerhead', 'Norwegian_elkhound',
       'Norwich_terrier', 'Airedale', 'Siamese_cat', 'sea_cucumber',
       'seashore', 'nipple', 'moped', 'Arabian_camel', 'crayfish',
       'wallaby', 'wire-haired_fox_terrier', 'toilet_seat',
       'Old_English_sheepdog', 'pajama', 'Walker_hound', 'shovel',
       'bucket', 'Sealyham_terrier', 'Windsor_tie', 'Siberian_husky',
       'quill', 'Persian_cat', 'European_fire_salamander',
       'three-toed_sloth', 'swab', 'echidna', 'tennis_ball', 'Lhasa',
       'coral_reef', 'keeshond', 'mink', 'screw', 'basset', 'wreck',
       'kimono', 'German_short-haired_pointer', 'joystick', 'microwave',
       'Tibetan_terrier', 'Irish_wolfhound', 'Samoyed', 'loggerhead',
       'French_loaf', 'Irish_setter', 'komondor', 'purse', 'greenhouse',
       'broccoli', 'shopping_basket', 'macaque', 'squirrel_monkey',
       'green_lizard', 'parallel_bars', 'cloak', 'chest', 'sundial',
       'mosquito_net', 'bath_towel', 'cuirass', 'zebra', 'lumbermill',
       'wallet', 'feather_boa', 'English_springer', 'electric_fan',
       'hippopotamus', 'ox', 'quilt', 'assault_rifle', 'axolotl', 'pot',
       'toyshop', 'pizza', 'scuba_diver', 'beaver', 'Mexican_hairless',
       'cliff', 'loupe', 'wild_boar', 'jaguar', 'hog', 'polecat', 'lion',
       'EntleBucher', 'hand-held_computer', 'washbasin', 'whiptail',
       'rock_crab', 'hare', 'shoji', 'sombrero', 'bell_cote', 'rifle',
       'goose', 'pickup', 'sunglasses', 'limousine', 'bow_tie', 'pretzel',
       'marmot', 'ice_lolly', 'vacuum', 'dalmatian', 'prison',
       'shower_cap', 'sliding_door', 'dugong', 'otterhound', 'eel',
       'binder', 'bullfrog', 'soap_dispenser', 'sea_lion', 'carton',
       'brass', 'mitten', 'golfcart', 'cougar', 'warthog', 'umbrella',
       'neck_brace', 'cup', 'book_jacket', 'padlock', 'cab', 'chime',
       'Leonberg', 'viaduct', 'American_black_bear', 'tub', 'hand_blower',
       'king_penguin', 'rotisserie', 'bannister', 'passenger_car',
       'mongoose', 'dhole', 'consomme', 'valley', 'park_bench',
       'mushroom', 'barrow', 'parachute', 'desktop_computer', 'snorkel',
       'wok', 'affenpinscher', 'space_shuttle', 'rain_barrel',
       'ballplayer', 'mountain_tent', 'oxcart', 'buckeye', 'sunglass',
       'croquet_ball', 'refrigerator', 'snow_leopard', 'tripod',
       'rapeseed', 'tiger_cat', 'Bernese_mountain_dog', 'notebook',
       'maraca', 'pool_table', 'lakeside', 'theater_curtain', 'pier',
       'cheetah', 'mousetrap', 'pop_bottle', 'soccer_ball', 'wombat',
       'rhinoceros_beetle', 'paddlewheel', 'paintbrush', 'maze',
       'hatchet', 'chain', 'jigsaw_puzzle', 'switch',
       'Kerry_blue_terrier', 'barbell', 'convertible',
       'entertainment_center', 'file', 'guillotine', 'nail',
       'standard_schnauzer', 'bow', 'grocery_store', 'boathouse', 'conch',
       'Bouvier_des_Flandres', 'grey_fox', 'shopping_cart', 'meerkat',
       'grand_piano', 'envelope', 'screen', 'coffeepot', 'printer',
       'otter', 'restaurant', 'bonnet', 'crossword_puzzle', 'go-kart',
       'Sussex_spaniel', 'orangutan', 'canoe', 'barber_chair',
       'traffic_light', 'ibex', 'can_opener', 'Indian_elephant',
       'spatula', 'banana'], dtype=object)
In [46]:
image_prediction.query('p3 == "Band_Aid"').jpg_url
Out[46]:
115    https://pbs.twimg.com/media/CUT9PuQWwAABQv7.jpg
Name: jpg_url, dtype: object

In [47]:
#Check if there's repeated urls 
image_prediction['jpg_url'].value_counts()
Out[47]:
https://pbs.twimg.com/media/CZhn-QAWwAASQan.jpg                                            2
https://pbs.twimg.com/media/Cq9guJ5WgAADfpF.jpg                                            2
https://pbs.twimg.com/ext_tw_video_thumb/807106774843039744/pu/img/8XZg1xW35Xp2J6JW.jpg    2
https://pbs.twimg.com/media/CU1zsMSUAAAS0qW.jpg                                            2
https://pbs.twimg.com/media/CsrjryzWgAAZY00.jpg                                            2
                                                                                          ..
https://pbs.twimg.com/media/CXrmMSpUwAAdeRj.jpg                                            1
https://pbs.twimg.com/media/CXrawAhWkAAWSxC.jpg                                            1
https://pbs.twimg.com/media/CXrIntsUsAEkv0d.jpg                                            1
https://pbs.twimg.com/media/CXqcOHCUQAAugTB.jpg                                            1
https://pbs.twimg.com/media/DGKD1-bXoAAIAUK.jpg                                            1
Name: jpg_url, Length: 2009, dtype: int64
In [48]:
#Count number of duplicated urls
image_prediction.jpg_url.duplicated().sum()
Out[48]:
66
In [49]:
len(image_prediction.query('p1_dog == False'))
Out[49]:
543

Quality issues:¶

Completness, Accuracy, Consistency, and Validity

1.Twitter Archive¶

I - tweet_id column in wrong data type -> convert into string

II - in_reply_to_status_id, in_reply_to_user_id, retweeted_status_id, retweeted_status_user_id, and retweeted_status_timestamp have few entities so I'll drop them because they are not representitve of the whole dataframe

III - timestamp is string data type -> convert into datetime

IV - source coulmn has HTML anchor tag -> remove anchor tag from the column

V - rating_denominator coulmn has 23 entries where it's not equal to 10 -> Drop invalid numbers

VI - rating_numerator has 119 values that are considered outliers -> Drop outliers

VII - expanded_urls column has repeated values -> Drop duplicated values

VIII - change values in 'name', 'doggo', 'floofer', 'pupper', 'puppo' columns from none into nan

IX - Remove retweets from the data frame

  • Some values in name is not right('a', 'actually', 'all', 'an', 'by', 'getting','his', 'incredibly', 'infuriating', 'just', 'life', 'light', 'mad', 'my', 'not', 'officially', 'old', 'one', 'quite', 'space', 'such', 'the', 'this', 'unacceptable', 'very')

2.image_prediction¶

I - tweet_id column in wrong data type -> convert into string

II - jpg_url has duplicated values -> drop duplicated values

Tideness¶

I - merge name, doggo, floofer, pupper, puppo into one column

II - merge all tables into one table

In [94]:
twitter_archive = pd.read_csv('twitter-archive-enhanced.csv')
image_prediction = pd.read_csv('image_prediction.tsv', sep='\t')
json_tweets = pd.read_csv('json_tweets.csv')

Cleaning Code¶

In [95]:
#Make a copy of all data to wrangle and maintain orginal state of data also for reference
df_archive = twitter_archive.copy()
df_image = image_prediction.copy()
df_json = json_tweets.copy()

Define ¶

Remove retweets

Code¶

In [96]:
df_archive = df_archive[np.isnan(df_archive.retweeted_status_id)]

Test¶

In [97]:
df_archive.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 2175 entries, 0 to 2355
Data columns (total 17 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   tweet_id                    2175 non-null   int64  
 1   in_reply_to_status_id       78 non-null     float64
 2   in_reply_to_user_id         78 non-null     float64
 3   timestamp                   2175 non-null   object 
 4   source                      2175 non-null   object 
 5   text                        2175 non-null   object 
 6   retweeted_status_id         0 non-null      float64
 7   retweeted_status_user_id    0 non-null      float64
 8   retweeted_status_timestamp  0 non-null      object 
 9   expanded_urls               2117 non-null   object 
 10  rating_numerator            2175 non-null   int64  
 11  rating_denominator          2175 non-null   int64  
 12  name                        2175 non-null   object 
 13  doggo                       2175 non-null   object 
 14  floofer                     2175 non-null   object 
 15  pupper                      2175 non-null   object 
 16  puppo                       2175 non-null   object 
dtypes: float64(4), int64(3), object(10)
memory usage: 305.9+ KB

Define¶

I - Wrong datatype of tweet_id (int) change it into (String)

Code¶

In [98]:
df_archive.tweet_id = df_archive.tweet_id.astype(str)

Test¶

In [99]:
df_archive.dtypes
Out[99]:
tweet_id                       object
in_reply_to_status_id         float64
in_reply_to_user_id           float64
timestamp                      object
source                         object
text                           object
retweeted_status_id           float64
retweeted_status_user_id      float64
retweeted_status_timestamp     object
expanded_urls                  object
rating_numerator                int64
rating_denominator              int64
name                           object
doggo                          object
floofer                        object
pupper                         object
puppo                          object
dtype: object

Define

II - Drop irrelevant Columns

Code¶

In [100]:
droped_columns = ['in_reply_to_status_id', 'in_reply_to_user_id', 'retweeted_status_id', 'retweeted_status_user_id', 'retweeted_status_timestamp']
df_archive.drop(droped_columns, axis = 1, inplace = True)

Test¶

In [101]:
df_archive.columns
Out[101]:
Index(['tweet_id', 'timestamp', 'source', 'text', 'expanded_urls',
       'rating_numerator', 'rating_denominator', 'name', 'doggo', 'floofer',
       'pupper', 'puppo'],
      dtype='object')

Define

III - change datatype of timestamp from string into datetime

Code¶

In [102]:
format = "%Y-%m-%d %H:%M:%S"
df_archive.timestamp = pd.to_datetime(df_archive.timestamp, format = format)

Test¶

In [103]:
df_archive.dtypes
Out[103]:
tweet_id                           object
timestamp             datetime64[ns, UTC]
source                             object
text                               object
expanded_urls                      object
rating_numerator                    int64
rating_denominator                  int64
name                               object
doggo                              object
floofer                            object
pupper                             object
puppo                              object
dtype: object
In [104]:
df_archive.source.value_counts()
Out[104]:
<a href="http://twitter.com/download/iphone" rel="nofollow">Twitter for iPhone</a>     2042
<a href="http://vine.co" rel="nofollow">Vine - Make a Scene</a>                          91
<a href="http://twitter.com" rel="nofollow">Twitter Web Client</a>                       31
<a href="https://about.twitter.com/products/tweetdeck" rel="nofollow">TweetDeck</a>      11
Name: source, dtype: int64
In [105]:
iphone = '<a href="http://twitter.com/download/iphone" rel="nofollow">Twitter for iPhone</a>'
vine = '<a href="http://vine.co" rel="nofollow">Vine - Make a Scene</a>'
web = '<a href="http://twitter.com" rel="nofollow">Twitter Web Client</a>'
tweetdeck = '<a href="https://about.twitter.com/products/tweetdeck" rel="nofollow">TweetDeck</a>'
iphone_n = 'Twitter for iPhone'
vine_n = 'Vine - Make a Scene'
web_n = 'Twitter Web Client'
tweetdeck_n = 'TweetDeck'

Define

IV - remove anchor tag from the source column

Code¶

In [106]:
df_archive['source'] = df_archive['source'].str.replace(iphone, iphone_n, regex = True).replace(vine, vine_n).replace(web, web_n).replace(tweetdeck, tweetdeck_n)

Test¶

In [107]:
df_archive['source'].value_counts()
Out[107]:
Twitter for iPhone     2042
Vine - Make a Scene      91
Twitter Web Client       31
TweetDeck                11
Name: source, dtype: int64
In [108]:
print('Number of rows where rating_denominator is not equal to 10 is: {}'.format(len(df_archive.query('rating_denominator != 10')))) 
print('Number of rows where rating_denominator is equal to 10 is: {}'.format(len(df_archive.query('rating_denominator == 10'))))
Number of rows where rating_denominator is not equal to 10 is: 22
Number of rows where rating_denominator is equal to 10 is: 2153

Define

V - rating_denominator coulmn has 23 entries where it's not equal to 10 -> Drop invalid numbers

Code¶

In [109]:
df_archive = df_archive[df_archive.rating_denominator == 10]

Test¶

In [110]:
df_archive.rating_denominator.value_counts()
Out[110]:
10    2153
Name: rating_denominator, dtype: int64

Define

VI - Drop outliers of rating_numerator column

Code¶

In [111]:
twitter_archive = twitter_archive.query('(rating_numerator <= 15) & (rating_numerator >= 6)')

Test¶

In [112]:
twitter_archive.rating_numerator.value_counts()
Out[112]:
12    558
11    464
10    461
13    351
9     158
8     102
7      55
14     54
6      32
15      2
Name: rating_numerator, dtype: int64

Define

VII - Drop duplicated values and NaN
In [113]:
df_archive.expanded_urls.value_counts()
Out[113]:
https://vine.co/v/ea0OwvPTx9l                                                                                                                                                                      2
https://twitter.com/dog_rates/status/892420643555336193/photo/1                                                                                                                                    1
https://twitter.com/dog_rates/status/684481074559381504/photo/1                                                                                                                                    1
https://twitter.com/dog_rates/status/683834909291606017/video/1                                                                                                                                    1
https://twitter.com/dog_rates/status/683849932751646720/photo/1                                                                                                                                    1
                                                                                                                                                                                                  ..
https://twitter.com/dog_rates/status/759047813560868866/photo/1,https://twitter.com/dog_rates/status/759047813560868866/photo/1                                                                    1
https://twitter.com/dog_rates/status/759099523532779520/photo/1                                                                                                                                    1
https://twitter.com/dog_rates/status/759197388317847553/photo/1,https://twitter.com/dog_rates/status/759197388317847553/photo/1,https://twitter.com/dog_rates/status/759197388317847553/photo/1    1
https://twitter.com/wsaznews/status/759167558763196416                                                                                                                                             1
https://twitter.com/dog_rates/status/666020888022790149/photo/1                                                                                                                                    1
Name: expanded_urls, Length: 2098, dtype: int64
In [114]:
df_archive.query('expanded_urls.isna() == True')
Out[114]:
tweet_id timestamp source text expanded_urls rating_numerator rating_denominator name doggo floofer pupper puppo
30 886267009285017600 2017-07-15 16:51:35+00:00 Twitter for iPhone @NonWhiteHat @MayhewMayhem omg hello tanner yo... NaN 12 10 None None None None None
55 881633300179243008 2017-07-02 21:58:53+00:00 Twitter for iPhone @roushfenway These are good dogs but 17/10 is ... NaN 17 10 None None None None None
64 879674319642796034 2017-06-27 12:14:36+00:00 Twitter for iPhone @RealKentMurphy 14/10 confirmed NaN 14 10 None None None None None
113 870726314365509632 2017-06-02 19:38:25+00:00 Twitter for iPhone @ComplicitOwl @ShopWeRateDogs &gt;10/10 is res... NaN 10 10 None None None None None
148 863427515083354112 2017-05-13 16:15:35+00:00 Twitter for iPhone @Jack_Septic_Eye I'd need a few more pics to p... NaN 12 10 None None None None None
179 857214891891077121 2017-04-26 12:48:51+00:00 Twitter for iPhone @Marc_IRL pixelated af 12/10 NaN 12 10 None None None None None
186 856288084350160898 2017-04-23 23:26:03+00:00 Twitter for iPhone @xianmcguire @Jenna_Marbles Kardashians wouldn... NaN 14 10 None None None None None
188 855862651834028034 2017-04-22 19:15:32+00:00 Twitter for iPhone @dhmontgomery We also gave snoop dogg a 420/10... NaN 420 10 None None None None None
189 855860136149123072 2017-04-22 19:05:32+00:00 Twitter for iPhone @s8n You tried very hard to portray this good ... NaN 666 10 None None None None None
218 850333567704068097 2017-04-07 13:04:55+00:00 Twitter for iPhone @markhoppus MARK THAT DOG HAS SEEN AND EXPERIE... NaN 13 10 None None None None None
228 848213670039564288 2017-04-01 16:41:12+00:00 Twitter for iPhone Jerry just apuppologized to me. He said there ... NaN 11 10 None None None None None
234 847617282490613760 2017-03-31 01:11:22+00:00 Twitter for iPhone .@breaannanicolee PUPDATE: Cannon has a heart ... NaN 13 10 None None None None None
274 840698636975636481 2017-03-11 22:59:09+00:00 Twitter for iPhone @0_kelvin_0 &gt;10/10 is reserved for puppos s... NaN 10 10 None None None None None
290 838150277551247360 2017-03-04 22:12:52+00:00 Twitter for iPhone @markhoppus 182/10 NaN 182 10 None None None None None
291 838085839343206401 2017-03-04 17:56:49+00:00 Twitter for iPhone @bragg6of8 @Andy_Pace_ we are still looking fo... NaN 15 10 None None None None None
346 831926988323639298 2017-02-15 18:03:45+00:00 Twitter for iPhone @UNC can confirm 12/10 NaN 12 10 None None None None None
375 828361771580813312 2017-02-05 21:56:51+00:00 Twitter Web Client Beebop and Doobert should start a band 12/10 w... NaN 12 10 None None None None None
387 826598799820865537 2017-02-01 01:11:25+00:00 Twitter for iPhone I was going to do 007/10, but the joke wasn't ... NaN 7 10 None None None None None
409 823333489516937216 2017-01-23 00:56:15+00:00 Twitter for iPhone @HistoryInPics 13/10 NaN 13 10 None None None None None
427 821153421864615936 2017-01-17 00:33:26+00:00 Twitter for iPhone @imgur for a polar bear tho I'd say 13/10 is a... NaN 13 10 None None None None None
498 813130366689148928 2016-12-25 21:12:41+00:00 Twitter for iPhone I've been informed by multiple sources that th... NaN 12 10 None None None None None
513 811647686436880384 2016-12-21 19:01:02+00:00 Twitter for iPhone PUPDATE: I've been informed that Augie was act... NaN 11 10 None None None None None
570 801854953262350336 2016-11-24 18:28:13+00:00 Twitter for iPhone .@NBCSports OMG THE TINY HAT I'M GOING TO HAVE... NaN 11 10 None None None None None
576 800859414831898624 2016-11-22 00:32:18+00:00 Twitter for iPhone @SkyWilliams doggo simply protecting you from ... NaN 11 10 None doggo None None None
611 797165961484890113 2016-11-11 19:55:50+00:00 Twitter for iPhone @JODYHiGHROLLER it may be an 11/10 but what do... NaN 11 10 None None None None None
701 786051337297522688 2016-10-12 03:50:17+00:00 Twitter for iPhone 13/10 for breakdancing puppo @shibbnbot NaN 13 10 None None None None puppo
707 785515384317313025 2016-10-10 16:20:36+00:00 Twitter for iPhone Today, 10/10, should be National Dog Rates Day NaN 10 10 None None None None None
843 766714921925144576 2016-08-19 19:14:16+00:00 Twitter for iPhone His name is Charley and he already has a new s... NaN 13 10 None None None None None
857 763956972077010945 2016-08-12 04:35:10+00:00 Twitter for iPhone @TheEllenShow I'm not sure if you know this bu... NaN 12 10 None doggo None None None
967 750381685133418496 2016-07-05 17:31:49+00:00 Twitter for iPhone 13/10 such a good doggo\n@spaghemily NaN 13 10 None doggo None None None
1005 747651430853525504 2016-06-28 04:42:46+00:00 Twitter for iPhone Other pupper asked not to have his identity sh... NaN 12 10 None None None pupper None
1080 738891149612572673 2016-06-04 00:32:32+00:00 Twitter for iPhone @mount_alex3 13/10 NaN 13 10 None None None None None
1295 707983188426153984 2016-03-10 17:35:20+00:00 Twitter for iPhone @serial @MrRoles OH MY GOD I listened to all o... NaN 12 10 None None None None None
1345 704491224099647488 2016-03-01 02:19:31+00:00 Twitter for iPhone 13/10 hero af\n@ABC NaN 13 10 None None None None None
1445 696518437233913856 2016-02-08 02:18:30+00:00 Twitter for iPhone Oh my god 10/10 for every little hot dog pupper NaN 10 10 None None None pupper None
1446 696490539101908992 2016-02-08 00:27:39+00:00 Twitter for iPhone After reading the comments I may have overesti... NaN 1 10 None None None None None
1474 693644216740769793 2016-01-31 03:57:23+00:00 Twitter for iPhone BREAKING PUPDATE: I've just been notified that... NaN 10 10 None None None None None
1479 693582294167244802 2016-01-30 23:51:19+00:00 Twitter for iPhone Personally I'd give him an 11/10. Not sure why... NaN 11 10 None None None None None
1497 692423280028966913 2016-01-27 19:05:49+00:00 Twitter for iPhone PUPDATE: just noticed this dog has some extra ... NaN 9 10 None None None None None
1523 690607260360429569 2016-01-22 18:49:36+00:00 Twitter for iPhone 12/10 @LightningHoltt NaN 12 10 None None None None None
1605 685681090388975616 2016-01-09 04:34:45+00:00 Twitter for iPhone Jack deserves another round of applause. If yo... NaN 14 10 None None None None None
1618 684969860808454144 2016-01-07 05:28:35+00:00 Twitter for iPhone For those who claim this is a goat, u are wron... NaN 5 10 None None None None None
1689 681340665377193984 2015-12-28 05:07:27+00:00 Twitter for iPhone I've been told there's a slight possibility he... NaN 5 10 None None None None None
1774 678023323247357953 2015-12-19 01:25:31+00:00 Twitter for iPhone After getting lost in Reese's eyes for several... NaN 13 10 None None None None None
1819 676590572941893632 2015-12-15 02:32:17+00:00 Twitter for iPhone After some outrage from the crowd. Bubbles is ... NaN 7 10 None None None None None
1844 675849018447167488 2015-12-13 01:25:37+00:00 Twitter for iPhone This dog is being demoted to a 9/10 for not we... NaN 9 10 None None None None None
1895 674742531037511680 2015-12-10 00:08:50+00:00 Twitter for iPhone Some clarification is required. The dog is sin... NaN 11 10 None None None None None
1905 674606911342424069 2015-12-09 15:09:55+00:00 Twitter for iPhone The 13/10 also takes into account this impecca... NaN 13 10 None None None None None
1914 674330906434379776 2015-12-08 20:53:11+00:00 Twitter for iPhone 13/10\n@ABC7 NaN 13 10 None None None None None
1940 673716320723169284 2015-12-07 04:11:02+00:00 Twitter for iPhone The millennials have spoken and we've decided ... NaN 1 10 None None None None None
2038 671550332464455680 2015-12-01 04:44:10+00:00 Twitter for iPhone After 22 minutes of careful deliberation this ... NaN 1 10 None None None None None
2149 669684865554620416 2015-11-26 01:11:28+00:00 Twitter for iPhone After countless hours of research and hundreds... NaN 11 10 None None None None None
2189 668967877119254528 2015-11-24 01:42:25+00:00 Twitter for iPhone 12/10 good shit Bubka\n@wane15 NaN 12 10 None None None None None
2298 667070482143944705 2015-11-18 20:02:51+00:00 Twitter for iPhone After much debate this dog is being upgraded t... NaN 10 10 None None None None None
In [115]:
df_archive.expanded_urls.isna().sum()
Out[115]:
54
In [116]:
duplicated_rows = df_archive[df_archive.duplicated('expanded_urls')]
duplicated_rows.head(10)
Out[116]:
tweet_id timestamp source text expanded_urls rating_numerator rating_denominator name doggo floofer pupper puppo
55 881633300179243008 2017-07-02 21:58:53+00:00 Twitter for iPhone @roushfenway These are good dogs but 17/10 is ... NaN 17 10 None None None None None
64 879674319642796034 2017-06-27 12:14:36+00:00 Twitter for iPhone @RealKentMurphy 14/10 confirmed NaN 14 10 None None None None None
113 870726314365509632 2017-06-02 19:38:25+00:00 Twitter for iPhone @ComplicitOwl @ShopWeRateDogs &gt;10/10 is res... NaN 10 10 None None None None None
148 863427515083354112 2017-05-13 16:15:35+00:00 Twitter for iPhone @Jack_Septic_Eye I'd need a few more pics to p... NaN 12 10 None None None None None
179 857214891891077121 2017-04-26 12:48:51+00:00 Twitter for iPhone @Marc_IRL pixelated af 12/10 NaN 12 10 None None None None None
186 856288084350160898 2017-04-23 23:26:03+00:00 Twitter for iPhone @xianmcguire @Jenna_Marbles Kardashians wouldn... NaN 14 10 None None None None None
188 855862651834028034 2017-04-22 19:15:32+00:00 Twitter for iPhone @dhmontgomery We also gave snoop dogg a 420/10... NaN 420 10 None None None None None
189 855860136149123072 2017-04-22 19:05:32+00:00 Twitter for iPhone @s8n You tried very hard to portray this good ... NaN 666 10 None None None None None
218 850333567704068097 2017-04-07 13:04:55+00:00 Twitter for iPhone @markhoppus MARK THAT DOG HAS SEEN AND EXPERIE... NaN 13 10 None None None None None
228 848213670039564288 2017-04-01 16:41:12+00:00 Twitter for iPhone Jerry just apuppologized to me. He said there ... NaN 11 10 None None None None None
In [117]:
len(duplicated_rows)
Out[117]:
54

Code¶

In [118]:
df_archive = df_archive.dropna(subset = 'expanded_urls')
In [119]:
df_archive.drop_duplicates(subset = 'expanded_urls', inplace = True)

Test¶

In [120]:
print('Number of NaN in expanded URLs colum is : {}\nNumber of rows in expanded URLs colum is : {}\nNumber of unique values in expanded URLs colum is : {}'
      .format(df_archive.expanded_urls.isna().sum(), 
              df_archive.expanded_urls.nunique(), 
              len(df_archive.expanded_urls)))
Number of NaN in expanded URLs colum is : 0
Number of rows in expanded URLs colum is : 2098
Number of unique values in expanded URLs colum is : 2098

Define

I - tweet_id column in wrong data type -> convert into string

Code¶

In [121]:
df_image.tweet_id = df_image.tweet_id.astype(str) 

Test¶

In [122]:
df_image.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2075 entries, 0 to 2074
Data columns (total 12 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   tweet_id  2075 non-null   object 
 1   jpg_url   2075 non-null   object 
 2   img_num   2075 non-null   int64  
 3   p1        2075 non-null   object 
 4   p1_conf   2075 non-null   float64
 5   p1_dog    2075 non-null   bool   
 6   p2        2075 non-null   object 
 7   p2_conf   2075 non-null   float64
 8   p2_dog    2075 non-null   bool   
 9   p3        2075 non-null   object 
 10  p3_conf   2075 non-null   float64
 11  p3_dog    2075 non-null   bool   
dtypes: bool(3), float64(3), int64(1), object(5)
memory usage: 152.1+ KB

Define

jpg_url has duplicated values -> drop duplicated values in jpg_url column

In [123]:
df_image.jpg_url.value_counts()
Out[123]:
https://pbs.twimg.com/media/CZhn-QAWwAASQan.jpg                                            2
https://pbs.twimg.com/media/Cq9guJ5WgAADfpF.jpg                                            2
https://pbs.twimg.com/ext_tw_video_thumb/807106774843039744/pu/img/8XZg1xW35Xp2J6JW.jpg    2
https://pbs.twimg.com/media/CU1zsMSUAAAS0qW.jpg                                            2
https://pbs.twimg.com/media/CsrjryzWgAAZY00.jpg                                            2
                                                                                          ..
https://pbs.twimg.com/media/CXrmMSpUwAAdeRj.jpg                                            1
https://pbs.twimg.com/media/CXrawAhWkAAWSxC.jpg                                            1
https://pbs.twimg.com/media/CXrIntsUsAEkv0d.jpg                                            1
https://pbs.twimg.com/media/CXqcOHCUQAAugTB.jpg                                            1
https://pbs.twimg.com/media/DGKD1-bXoAAIAUK.jpg                                            1
Name: jpg_url, Length: 2009, dtype: int64

Code¶

In [124]:
df_image.drop_duplicates(subset = 'jpg_url', inplace = True)

Test¶

In [125]:
df_image.jpg_url.value_counts()
Out[125]:
https://pbs.twimg.com/media/CT4udn0WwAA0aMy.jpg                                            1
https://pbs.twimg.com/ext_tw_video_thumb/758467147756691456/pu/img/YTNzjRFDSPNXukmM.jpg    1
https://pbs.twimg.com/media/Coy87yiWYAACtPf.jpg                                            1
https://pbs.twimg.com/media/CovKqSYVIAAUbUW.jpg                                            1
https://pbs.twimg.com/media/CouEOZhWAAAgFpE.jpg                                            1
                                                                                          ..
https://pbs.twimg.com/media/CXmd_bsWkAEEXck.jpg                                            1
https://pbs.twimg.com/media/CXltdtaWYAIuX_V.jpg                                            1
https://pbs.twimg.com/media/CXlN1-EWMAQdwXK.jpg                                            1
https://pbs.twimg.com/media/CXk4W0qWYAMEMEs.jpg                                            1
https://pbs.twimg.com/media/DGKD1-bXoAAIAUK.jpg                                            1
Name: jpg_url, Length: 2009, dtype: int64
In [126]:
len(df_image[(df_image['p1_dog'] == False) & (df_image['p2_dog'] == False) & (df_image['p3_dog'] == False)])
Out[126]:
318

Define

Drop Values where none of the three predictions are dogs

Code¶

In [127]:
df_image = df_image[(df_image['p1_dog'] == True) | (df_image['p2_dog'] == True) | (df_image['p2_dog'] == True)]

Test¶

In [128]:
len(df_image[(df_image['p1_dog'] == False) & (df_image['p2_dog'] == False) & (df_image['p3_dog'] == False)])
Out[128]:
0

Tideness¶

In [129]:
df_archive.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 2098 entries, 0 to 2355
Data columns (total 12 columns):
 #   Column              Non-Null Count  Dtype              
---  ------              --------------  -----              
 0   tweet_id            2098 non-null   object             
 1   timestamp           2098 non-null   datetime64[ns, UTC]
 2   source              2098 non-null   object             
 3   text                2098 non-null   object             
 4   expanded_urls       2098 non-null   object             
 5   rating_numerator    2098 non-null   int64              
 6   rating_denominator  2098 non-null   int64              
 7   name                2098 non-null   object             
 8   doggo               2098 non-null   object             
 9   floofer             2098 non-null   object             
 10  pupper              2098 non-null   object             
 11  puppo               2098 non-null   object             
dtypes: datetime64[ns, UTC](1), int64(2), object(9)
memory usage: 213.1+ KB

Define

I - change values in 'doggo', 'floofer', 'pupper', 'puppo' columns from none into nan and merge them into one column

Code¶

In [130]:
df_archive = twitter_archive.copy()
df_archive.columns
Out[130]:
Index(['tweet_id', 'in_reply_to_status_id', 'in_reply_to_user_id', 'timestamp',
       'source', 'text', 'retweeted_status_id', 'retweeted_status_user_id',
       'retweeted_status_timestamp', 'expanded_urls', 'rating_numerator',
       'rating_denominator', 'name', 'doggo', 'floofer', 'pupper', 'puppo'],
      dtype='object')
In [131]:
names_columns = ['doggo', 'floofer', 'pupper', 'puppo']
names = df_archive[names_columns]
names = names.replace('None', np.NaN)
names['stage_name'] = names[names.columns[0:]].apply(
    lambda x: ','.join(x.dropna().astype(str)),
    axis=1
)
names.drop(names_columns, axis = 1, inplace = True)
In [132]:
df_archive.drop(names_columns, axis = 1, inplace = True)
df_archive.tweet_id = df_archive.tweet_id.astype(str)
df_archive = pd.concat([df_archive, names], axis = 1)
df_archive.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 2237 entries, 0 to 2355
Data columns (total 14 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   tweet_id                    2237 non-null   object 
 1   in_reply_to_status_id       64 non-null     float64
 2   in_reply_to_user_id         64 non-null     float64
 3   timestamp                   2237 non-null   object 
 4   source                      2237 non-null   object 
 5   text                        2237 non-null   object 
 6   retweeted_status_id         178 non-null    float64
 7   retweeted_status_user_id    178 non-null    float64
 8   retweeted_status_timestamp  178 non-null    object 
 9   expanded_urls               2190 non-null   object 
 10  rating_numerator            2237 non-null   int64  
 11  rating_denominator          2237 non-null   int64  
 12  name                        2237 non-null   object 
 13  stage_name                  2237 non-null   object 
dtypes: float64(4), int64(2), object(8)
memory usage: 262.1+ KB
In [133]:
df_archive['stage_name'] = df_archive['stage_name'].replace('', np.NaN)
df_archive.stage_name.isna().sum()
Out[133]:
1861

Test¶

In [134]:
df_archive.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 2237 entries, 0 to 2355
Data columns (total 14 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   tweet_id                    2237 non-null   object 
 1   in_reply_to_status_id       64 non-null     float64
 2   in_reply_to_user_id         64 non-null     float64
 3   timestamp                   2237 non-null   object 
 4   source                      2237 non-null   object 
 5   text                        2237 non-null   object 
 6   retweeted_status_id         178 non-null    float64
 7   retweeted_status_user_id    178 non-null    float64
 8   retweeted_status_timestamp  178 non-null    object 
 9   expanded_urls               2190 non-null   object 
 10  rating_numerator            2237 non-null   int64  
 11  rating_denominator          2237 non-null   int64  
 12  name                        2237 non-null   object 
 13  stage_name                  376 non-null    object 
dtypes: float64(4), int64(2), object(8)
memory usage: 262.1+ KB
In [135]:
df_json = df_json.rename(columns = {'id':'tweet_id'})
df_json.tweet_id = df_json.tweet_id.astype(str)
df_json.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1615 entries, 0 to 1614
Data columns (total 3 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   tweet_id        1615 non-null   object
 1   retweet_count   1615 non-null   int64 
 2   favorite_count  1615 non-null   int64 
dtypes: int64(2), object(1)
memory usage: 38.0+ KB

Define

II - merge all tables into one table

Code¶

In [136]:
df = df_archive.merge(df_image, how = 'outer', on = 'tweet_id').merge(df_json, how = 'outer', on = 'tweet_id')

Test¶

In [137]:
df.head()
Out[137]:
tweet_id in_reply_to_status_id in_reply_to_user_id timestamp source text retweeted_status_id retweeted_status_user_id retweeted_status_timestamp expanded_urls ... p1_conf p1_dog p2 p2_conf p2_dog p3 p3_conf p3_dog retweet_count favorite_count
0 892420643555336193 NaN NaN 2017-08-01 16:23:56 +0000 <a href="http://twitter.com/download/iphone" r... This is Phineas. He's a mystical boy. Only eve... NaN NaN NaN https://twitter.com/dog_rates/status/892420643... ... NaN NaN NaN NaN NaN NaN NaN NaN 6973.0 33703.0
1 892177421306343426 NaN NaN 2017-08-01 00:17:27 +0000 <a href="http://twitter.com/download/iphone" r... This is Tilly. She's just checking pup on you.... NaN NaN NaN https://twitter.com/dog_rates/status/892177421... ... 0.323581 True Pekinese 0.090647 True papillon 0.068957 True 5276.0 29229.0
2 891815181378084864 NaN NaN 2017-07-31 00:18:03 +0000 <a href="http://twitter.com/download/iphone" r... This is Archie. He is a rare Norwegian Pouncin... NaN NaN NaN https://twitter.com/dog_rates/status/891815181... ... 0.716012 True malamute 0.078253 True kelpie 0.031379 True 3465.0 21976.0
3 891689557279858688 NaN NaN 2017-07-30 15:58:51 +0000 <a href="http://twitter.com/download/iphone" r... This is Darla. She commenced a snooze mid meal... NaN NaN NaN https://twitter.com/dog_rates/status/891689557... ... 0.170278 False Labrador_retriever 0.168086 True spatula 0.040836 False 7193.0 36799.0
4 891327558926688256 NaN NaN 2017-07-29 16:00:24 +0000 <a href="http://twitter.com/download/iphone" r... This is Franklin. He would like you to stop ca... NaN NaN NaN https://twitter.com/dog_rates/status/891327558... ... 0.555712 True English_springer 0.225770 True German_short-haired_pointer 0.175219 True 7719.0 35186.0

5 rows × 27 columns

In [138]:
df.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 2340 entries, 0 to 2339
Data columns (total 27 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   tweet_id                    2340 non-null   object 
 1   in_reply_to_status_id       64 non-null     float64
 2   in_reply_to_user_id         64 non-null     float64
 3   timestamp                   2237 non-null   object 
 4   source                      2237 non-null   object 
 5   text                        2237 non-null   object 
 6   retweeted_status_id         178 non-null    float64
 7   retweeted_status_user_id    178 non-null    float64
 8   retweeted_status_timestamp  178 non-null    object 
 9   expanded_urls               2190 non-null   object 
 10  rating_numerator            2237 non-null   float64
 11  rating_denominator          2237 non-null   float64
 12  name                        2237 non-null   object 
 13  stage_name                  376 non-null    object 
 14  jpg_url                     1629 non-null   object 
 15  img_num                     1629 non-null   float64
 16  p1                          1629 non-null   object 
 17  p1_conf                     1629 non-null   float64
 18  p1_dog                      1629 non-null   object 
 19  p2                          1629 non-null   object 
 20  p2_conf                     1629 non-null   float64
 21  p2_dog                      1629 non-null   object 
 22  p3                          1629 non-null   object 
 23  p3_conf                     1629 non-null   float64
 24  p3_dog                      1629 non-null   object 
 25  retweet_count               1615 non-null   float64
 26  favorite_count              1615 non-null   float64
dtypes: float64(12), object(15)
memory usage: 511.9+ KB

Storing Data¶

In [139]:
df.to_csv('twitter_archive_master.csv', index = False)

Analyzing and Visualizing Data¶

In [140]:
df.img_num.value_counts()
Out[140]:
1.0    1385
2.0     165
3.0      53
4.0      26
Name: img_num, dtype: int64
In [141]:
df.describe()
Out[141]:
in_reply_to_status_id in_reply_to_user_id retweeted_status_id retweeted_status_user_id rating_numerator rating_denominator img_num p1_conf p2_conf p3_conf retweet_count favorite_count
count 6.400000e+01 6.400000e+01 1.780000e+02 1.780000e+02 2237.000000 2237.000000 1629.000000 1629.000000 1629.000000 1.629000e+03 1615.000000 1615.000000
mean 7.449206e+17 2.454770e+16 7.731397e+17 1.262626e+16 10.985248 10.003576 1.214242 0.606673 0.138292 6.175369e-02 2767.946749 8057.959133
std 7.465588e+16 1.381047e+17 6.187355e+16 9.678889e+16 1.661211 0.111846 0.573916 0.265185 0.101527 5.215186e-02 4253.716518 11845.717164
min 6.658147e+17 1.185634e+07 6.671383e+17 7.832140e+05 6.000000 10.000000 1.000000 0.044333 0.000010 2.160900e-07 1.000000 0.000000
25% 6.754602e+17 4.400207e+08 7.273831e+17 4.196984e+09 10.000000 10.000000 1.000000 0.379797 0.055527 1.596500e-02 406.000000 730.000000
50% 7.062329e+17 4.196984e+09 7.805335e+17 4.196984e+09 11.000000 10.000000 1.000000 0.607401 0.122019 5.059200e-02 1484.000000 2960.000000
75% 8.216960e+17 4.196984e+09 8.206409e+17 4.196984e+09 12.000000 10.000000 1.000000 0.852088 0.199619 9.521820e-02 3397.500000 11175.000000
max 8.862664e+17 8.405479e+17 8.874740e+17 7.874618e+17 15.000000 15.000000 4.000000 0.999984 0.467678 2.734190e-01 51446.000000 123705.000000
In [142]:
labels = ['iPhone', 'Vine', 'Twitter Web Client', 'TweetDeck']
plt.figure(figsize = (10,10))
plt.pie(df['source'].value_counts(), labels = labels, autopct = '%1.2f', textprops={'fontsize': 18})
plt.yticks(fontsize=20)
plt.title('Most popular source', fontsize = 18);
In [143]:
plt.figure(figsize = (15,25))
breed = df.groupby('p1').filter(lambda x: len(x) > 10)
breed.p1.value_counts(normalize = True).plot(kind = 'barh');
plt.title('Most popular dog breed')
plt.xlabel('Count')
plt.ylabel('Dog Breed');
In [144]:
plt.figure(figsize = (10,10))
sns.regplot(data = df, x = 'retweet_count', y = 'favorite_count',
           scatter_kws={"color": "blue"}, line_kws={"color": "red"})
plt.title('Corelation between retweet count and favorite count')
plt.xlabel('Retweets Count')
plt.ylabel('Favorite Count');
In [ ]: